Squashed 'third_party/blasfeo/' content from commit 2a828ca

Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/kernel/avx/Makefile b/kernel/avx/Makefile
new file mode 100644
index 0000000..f260086
--- /dev/null
+++ b/kernel/avx/Makefile
@@ -0,0 +1,54 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS = 
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_diag_lib8.o kernel_sgecp_lib8.o kernel_sgetr_lib8.o kernel_sgead_lib8.o kernel_sgesc_lib8.o kernel_sgemv_8_lib8.o kernel_sgemv_4_lib8.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += kernel_dgemm_8x4_lib4.o kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_12_lib4.o kernel_dgemv_8_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_6_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o kernel_dgebp_lib4.o
+OBJS += kernel_sgemm_16x4_lib8.o kernel_sgemm_8x8_lib8.o kernel_sgemm_8x4_lib8.o kernel_sgemm_diag_lib8.o kernel_sgecp_lib8.o kernel_sgetr_lib8.o kernel_sgead_lib8.o kernel_sgesc_lib8.o kernel_sgemv_8_lib8.o kernel_sgemv_4_lib8.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+	rm -f *.o
+	rm -f *.s
+
diff --git a/kernel/avx/kernel_dgebp_lib4.S b/kernel/avx/kernel_dgebp_lib4.S
new file mode 100644
index 0000000..0e8581e
--- /dev/null
+++ b/kernel/avx/kernel_dgebp_lib4.S
@@ -0,0 +1,935 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
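+
+// Note on the macros above: on Linux and Mac (System V AMD64 ABI) the first six
+// integer arguments arrive in rdi, rsi, rdx, rcx, r8, r9 and any further ones on
+// the stack just above the return address (hence ARG7 = STACKSIZE+8(%rsp) once
+// the 64-byte save area has been allocated); on Windows x64 only rcx, rdx, r8, r9
+// carry arguments and the caller reserves a 32-byte shadow space, so the first
+// stack argument sits at STACKSIZE+40(%rsp), while rdi, rsi and xmm6-xmm15 are
+// callee-saved and therefore spilled by PROLOGUE. vzeroupper on entry and exit
+// avoids AVX-SSE transition penalties.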
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+//                               1      2          3        4          5          6
+// void kernel_dger4_sub_8r_lib4(int k, double *A, int sda, double *B, double *C, int sdc)
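+//
+// rough reference of what the kernel computes (C-style sketch, assuming the
+// lib4 panel-major layout, i.e. element (i,j) of an 8-row operand with panel
+// stride s lives at p[i/4*4*s + j*4 + i%4], and B is a single 4-row panel):
+//
+//   for(jj=0; jj<k; jj++)
+//     for(ii=0; ii<8; ii++)
+//       for(ll=0; ll<4; ll++)
+//         C[ii/4*4*sdc + jj*4 + ii%4] -= A[ii/4*4*sda + ll*4 + ii%4] * B[jj*4 + ll];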
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger4_sub_8r_lib4
+	.type kernel_dger4_sub_8r_lib4, @function
+kernel_dger4_sub_8r_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger4_sub_8r_lib4
+_kernel_dger4_sub_8r_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger4_sub_8r_lib4
+	.def kernel_dger4_sub_8r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_8r_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+	sall	$5, %r15d // 4*sdc*sizeof(double)
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// load block from A
+	vmovapd	0(%r11), %ymm0
+	vmovapd	32(%r11), %ymm1
+	vmovapd	64(%r11), %ymm2
+	vmovapd	96(%r11), %ymm3
+
+	vmovapd	0(%r11, %r12, 1), %ymm4
+	vmovapd	32(%r11, %r12, 1), %ymm5
+	vmovapd	64(%r11, %r12, 1), %ymm6
+	vmovapd	96(%r11, %r12, 1), %ymm7
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r14), %ymm8
+	vmovapd			0(%r14, %r15, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	8(%r13), %ymm15
+	subl	$4, %r10d
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	16(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	24(%r13), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, 0(%r14)
+	vmovapd			%ymm9, 0(%r14, %r15, 1)
+
+	vmovapd			32(%r14), %ymm8
+	vmovapd			32(%r14, %r15, 1), %ymm9
+	vbroadcastsd	32(%r13), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	40(%r13), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	48(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	56(%r13), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, 32(%r14)
+	vmovapd			%ymm9, 32(%r14, %r15, 1)
+
+	vmovapd			64(%r14), %ymm8
+	vmovapd			64(%r14, %r15, 1), %ymm9
+	vbroadcastsd	64(%r13), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	72(%r13), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	80(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	88(%r13), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, 64(%r14)
+	vmovapd			%ymm9, 64(%r14, %r15, 1)
+
+	vmovapd			96(%r14), %ymm8
+	vmovapd			96(%r14, %r15, 1), %ymm9
+	vbroadcastsd	96(%r13), %ymm15
+	addq	$128, %r13
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	-24(%r13), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	-16(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	-8(%r13), %ymm15
+	addq	$128, %r14
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, -32(%r14)
+	vmovapd			%ymm9, -32(%r14, %r15, 1)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r14), %ymm8
+	vmovapd			0(%r14, %r15, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	8(%r13), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	16(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	24(%r13), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, 0(%r14)
+	vmovapd			%ymm9, 0(%r14, %r15, 1)
+
+	addq	$32, %r13
+	addq	$32, %r14
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger4_sub_8r_lib4, .-kernel_dger4_sub_8r_lib4
+#endif
+
+
+
+
+
+//                                  1      2          3        4          5          6        7
+// void kernel_dger4_sub_8r_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, int km)
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger4_sub_8r_vs_lib4
+	.type kernel_dger4_sub_8r_vs_lib4, @function
+kernel_dger4_sub_8r_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger4_sub_8r_vs_lib4
+_kernel_dger4_sub_8r_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger4_sub_8r_vs_lib4
+	.def kernel_dger4_sub_8r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_8r_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+	sall	$5, %r15d // 4*sdc*sizeof(double)
+	movq	ARG7, %rax // km
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	vcvtsi2sd	%eax, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC01(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC01(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
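+	// ymm15 = { 4.5-km, 5.5-km, 6.5-km, 7.5-km }: the sign bit of lane i is set
+	// exactly when row 4+i is below km, so the vmaskmovpd loads below pick up
+	// only the valid rows of the lower A panel and zero the others, so the
+	// corresponding rows of C are effectively left unchanged by the update.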
+
+	// load block from A
+	vmovapd	0(%r11), %ymm0
+	vmovapd	32(%r11), %ymm1
+	vmovapd	64(%r11), %ymm2
+	vmovapd	96(%r11), %ymm3
+
+	vmaskmovpd	0(%r11, %r12, 1), %ymm15, %ymm4
+	vmaskmovpd	32(%r11, %r12, 1), %ymm15, %ymm5
+	vmaskmovpd	64(%r11, %r12, 1), %ymm15, %ymm6
+	vmaskmovpd	96(%r11, %r12, 1), %ymm15, %ymm7
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r14), %ymm8
+	vmovapd			0(%r14, %r15, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	8(%r13), %ymm15
+	subl	$4, %r10d
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	16(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	24(%r13), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, 0(%r14)
+	vmovapd			%ymm9, 0(%r14, %r15, 1)
+
+	vmovapd			32(%r14), %ymm8
+	vmovapd			32(%r14, %r15, 1), %ymm9
+	vbroadcastsd	32(%r13), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	40(%r13), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	48(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	56(%r13), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, 32(%r14)
+	vmovapd			%ymm9, 32(%r14, %r15, 1)
+
+	vmovapd			64(%r14), %ymm8
+	vmovapd			64(%r14, %r15, 1), %ymm9
+	vbroadcastsd	64(%r13), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	72(%r13), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	80(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	88(%r13), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, 64(%r14)
+	vmovapd			%ymm9, 64(%r14, %r15, 1)
+
+	vmovapd			96(%r14), %ymm8
+	vmovapd			96(%r14, %r15, 1), %ymm9
+	vbroadcastsd	96(%r13), %ymm15
+	addq	$128, %r13
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	-24(%r13), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	-16(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	-8(%r13), %ymm15
+	addq	$128, %r14
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, -32(%r14)
+	vmovapd			%ymm9, -32(%r14, %r15, 1)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r14), %ymm8
+	vmovapd			0(%r14, %r15, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	8(%r13), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	16(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	24(%r13), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, 0(%r14)
+	vmovapd			%ymm9, 0(%r14, %r15, 1)
+
+	addq	$32, %r13
+	addq	$32, %r14
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger4_sub_8r_vs_lib4, .-kernel_dger4_sub_8r_vs_lib4
+#endif
+
+
+
+
+
+//                               1      2          3          4
+// void kernel_dger4_sub_4r_lib4(int n, double *A, double *B, double *C)
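+//
+// rough reference (C-style sketch, same lib4 layout as above, all operands a
+// single 4-row panel):
+//
+//   for(jj=0; jj<n; jj++)
+//     for(ii=0; ii<4; ii++)
+//       for(ll=0; ll<4; ll++)
+//         C[jj*4+ii] -= A[ll*4+ii] * B[jj*4+ll];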
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger4_sub_4r_lib4
+	.type kernel_dger4_sub_4r_lib4, @function
+kernel_dger4_sub_4r_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger4_sub_4r_lib4
+_kernel_dger4_sub_4r_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger4_sub_4r_lib4
+	.def kernel_dger4_sub_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_4r_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10 // n
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // B
+	movq	ARG4, %r13 // C
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// load block from A
+	vmovapd	0(%r11), %ymm0
+	vmovapd	32(%r11), %ymm1
+	vmovapd	64(%r11), %ymm2
+	vmovapd	96(%r11), %ymm3
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r13), %ymm4
+	vbroadcastsd	0(%r12), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	8(%r12), %ymm15
+	subl	$4, %r10d
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	16(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	24(%r12), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, 0(%r13)
+
+	vmovapd			32(%r13), %ymm4
+	vbroadcastsd	32(%r12), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	40(%r12), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	48(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	56(%r12), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, 32(%r13)
+
+	vmovapd			64(%r13), %ymm4
+	vbroadcastsd	64(%r12), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	72(%r12), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	80(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	88(%r12), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, 64(%r13)
+
+	vmovapd			96(%r13), %ymm4
+	vbroadcastsd	96(%r12), %ymm15
+	addq	$128, %r12
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	-24(%r12), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	-16(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	-8(%r12), %ymm15
+	addq	$128, %r13
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, -32(%r13)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r13), %ymm4
+	vbroadcastsd	0(%r12), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	8(%r12), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	16(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	24(%r12), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, 0(%r13)
+
+	addq	$32, %r12
+	addq	$32, %r13
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger4_sub_4r_lib4, .-kernel_dger4_sub_4r_lib4
+#endif
+
+
+
+
+
+//                                  1      2          3          4          5
+// void kernel_dger4_sub_4r_vs_lib4(int n, double *A, double *B, double *C, int km)
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger4_sub_4r_vs_lib4
+	.type kernel_dger4_sub_4r_vs_lib4, @function
+kernel_dger4_sub_4r_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger4_sub_4r_vs_lib4
+_kernel_dger4_sub_4r_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger4_sub_4r_vs_lib4
+	.def kernel_dger4_sub_4r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_4r_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10 // n
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // B
+	movq	ARG4, %r13 // C
+	movq	ARG5, %r14 // km
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	vcvtsi2sd	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC00(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC00(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
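+	// same masking trick as in the 8-row kernel above, here against
+	// LC00 = { 0.5, 1.5, 2.5, 3.5 }: lane i of the A block is loaded only when
+	// i < km, so rows of C at index km and above are left unchanged.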
+
+	// load block from A
+	vmaskmovpd	0(%r11), %ymm15, %ymm0
+	vmaskmovpd	32(%r11), %ymm15, %ymm1
+	vmaskmovpd	64(%r11), %ymm15, %ymm2
+	vmaskmovpd	96(%r11), %ymm15, %ymm3
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r13), %ymm4
+	vbroadcastsd	0(%r12), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	8(%r12), %ymm15
+	subl	$4, %r10d
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	16(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	24(%r12), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, 0(%r13)
+
+	vmovapd			32(%r13), %ymm4
+	vbroadcastsd	32(%r12), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	40(%r12), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	48(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	56(%r12), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, 32(%r13)
+
+	vmovapd			64(%r13), %ymm4
+	vbroadcastsd	64(%r12), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	72(%r12), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	80(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	88(%r12), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, 64(%r13)
+
+	vmovapd			96(%r13), %ymm4
+	vbroadcastsd	96(%r12), %ymm15
+	addq	$128, %r12
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	-24(%r12), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	-16(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	-8(%r12), %ymm15
+	addq	$128, %r13
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, -32(%r13)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r13), %ymm4
+	vbroadcastsd	0(%r12), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	8(%r12), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	16(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	24(%r12), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, 0(%r13)
+
+	addq	$32, %r12
+	addq	$32, %r13
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger4_sub_4r_vs_lib4, .-kernel_dger4_sub_4r_vs_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00:
+#elif defined(OS_MAC)
+LC00:
+	.align 5
+#endif
+	.double 0.5
+	.double 1.5
+	.double 2.5
+	.double 3.5
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01:
+#elif defined(OS_MAC)
+LC01:
+	.align 5
+#endif
+	.double 4.5
+	.double 5.5
+	.double 6.5
+	.double 7.5
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02:
+#elif defined(OS_MAC)
+LC02:
+	.align 5
+#endif
+	.double 8.5
+	.double 9.5
+	.double 10.5
+	.double 11.5
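+
+// the three tables above are lane-index constants used to build vmaskmovpd
+// masks in the _vs kernels: LC00 covers rows 0-3 and LC01 rows 4-7 of a
+// column; LC02 would cover rows 8-11 and is not referenced by the kernels in
+// this file.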
+
+
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
+
diff --git a/kernel/avx/kernel_dgemm_4x4_lib4.S b/kernel/avx/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..95ff6ea
--- /dev/null
+++ b/kernel/avx/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,9906 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+4*k*sizeof(double)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
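+// rough sketch of one iteration over k: a column of A (4 doubles) is multiplied
+// element-wise by the matching column of B and by three permutations of it
+// (vshufpd swaps within the 128-bit lanes, vperm2f128 swaps the lanes), so
+// ymm0..ymm3 accumulate the four diagonals of the 4x4 product D += A * B',
+// to be un-permuted on the store side.
+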
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r12), %ymm12 // B[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmovapd 32(%r12), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	// unroll 1
+	vmovapd 64(%r12), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	// unroll 2
+	vmovapd 96(%r12), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r12
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	addq	$128, %r11
+
+
+	// unroll 3
+	vmovapd 0(%r12), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 0(%r11), %ymm8 // A0[0]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vmovapd 32(%r12), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	// unroll 1
+	vmovapd 64(%r12), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	// unroll 2
+	vmovapd 96(%r12), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r12
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	addq	$128, %r11
+
+
+	// unroll 3
+//	vmovapd 0(%r12), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+//	vmovapd 0(%r11), %ymm8 // A0[0]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+
+//	cmpl	$3, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd 0(%r12), %ymm12 // B[0]
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	addq	$32, %r11
+
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	addq	$32, %r12
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	subl	$1, %r10d
+
+	vshufpd $0x5, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+4*k*sizeof(double)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
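+// same structure as the add_nt kernel above with vaddpd replaced by vsubpd,
+// i.e. the four accumulators collect D -= A * B'.
+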
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nt_4x4_lib4, @function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r12), %ymm12 // B[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmovapd 32(%r12), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+
+	// unroll 1
+	vmovapd 64(%r12), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+
+	// unroll 2
+	vmovapd 96(%r12), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r12
+	addq	$128, %r11
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+
+
+	// unroll 3
+	vmovapd 0(%r12), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	cmpl	$4, %r10d
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vmovapd 32(%r12), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+
+	// unroll 1
+	vmovapd 64(%r12), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+
+	// unroll 2
+	vmovapd 96(%r12), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r12
+	addq	$128, %r11
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+
+
+	// unroll 3
+//	vmovapd 0(%r12), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+//	vmovapd 0(%r11), %ymm8 // A0[0]
+//	cmpl	$3, %r10d
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd 0(%r12), %ymm12 // B[0]
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	addq	$32, %r11
+
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	addq	$32, %r12
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+
+	vshufpd $0x5, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
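+// nn variant: B is not transposed, so single B elements are broadcast with
+// vbroadcastsd instead of permuting whole columns; r13 holds the byte stride
+// of one 4-row B panel and prefetcht0 touches the panel two strides ahead.
+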
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nn_4x4_lib4, @function
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 		0(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	0(%r12, %r13, 2) // software prefetch
+	prefetcht0	64(%r12, %r13, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	// unroll 0
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	addq	%r13, %r12
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	// unroll 0
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	addq	%r13, %r12
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	addq	$32, %r11
+	addq	$8, %r12
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nn_4x4_lib4, .-inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
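+// same as the add_nn kernel above with the accumulation sign flipped (vsubpd),
+// i.e. D -= A * B.
+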
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nn_4x4_lib4, @function
+inner_kernel_dgemm_sub_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 		0(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	0(%r12, %r13, 2) // software prefetch
+	prefetcht0	64(%r12, %r13, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+	// unroll 3
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	addq	%r13, %r12
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	addq	$128, %r11
+
+	// unroll 2
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+	// unroll 3
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	addq	%r13, %r12
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+	addq	$32, %r11
+	addq	$8, %r12
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nn_4x4_lib4, .-inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- B
+// r12   <- C
+// ymm0  <- [a00 a10 a20 a30]
+// ymm1  <- [a01 a11 a21 a31]
+// ymm2  <- [a02 a12 a22 a32]
+// ymm3  <- [a03 a13 a23 a33]
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- B+4*k*sizeof(double)
+// r12   <- C+4*k*sizeof(double)
+// ymm0  <- [a00 a10 a20 a30]
+// ymm1  <- [a01 a11 a21 a31]
+// ymm2  <- [a02 a12 a22 a32]
+// ymm3  <- [a03 a13 a23 a33]
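+//
+// rough C-style sketch (the 4x4 block A is held in ymm0..ymm3 as columns,
+// B and C packed in 4-row panels; indices in doubles, sketch only):
+//   for(jj=0; jj<k; jj++)
+//     for(i=0; i<4; i++)
+//       for(l=0; l<4; l++)
+//         C[4*jj+i] += A[i][l] * B[4*jj+l];
+// where A[i][l] denotes lane i of ymm'l'.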
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEBP_ADD_NN_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgebp_add_nn_4x4_lib4, @function
+inner_kernel_dgebp_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgebp_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_4x4_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r12), %ymm12
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	8(%r11), %ymm13
+	subl	$4, %r10d
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmovapd			%ymm12, 0(%r12)
+
+	vmovapd			32(%r12), %ymm12
+	vbroadcastsd	32(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	40(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	48(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	56(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmovapd			%ymm12, 32(%r12)
+
+	vmovapd			64(%r12), %ymm12
+	vbroadcastsd	64(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	72(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	80(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	88(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmovapd			%ymm12, 64(%r12)
+
+	vmovapd			96(%r12), %ymm12
+	vbroadcastsd	96(%r11), %ymm13
+	addq	$128, %r11
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	-24(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	-16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	-8(%r11), %ymm13
+	addq	$128, %r12
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmovapd			%ymm12, -32(%r12)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r12), %ymm12
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmovapd			%ymm12, 0(%r12)
+
+	addq	$32, %r11
+	addq	$32, %r12
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // main loop
+
+	// return
+0:
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgebp_add_nn_4x4_lib4, .-inner_kernel_dgebp_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
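+//
+// rough sketch: peel the first kend = min(k, 4-offB) columns one at a time
+// so that B becomes panel-aligned for the main kernel (indices in doubles,
+// sketch only):
+//   for(kk=0; kk<kend; kk++)
+//     for(j=0; j<4; j++)
+//       for(i=0; i<4; i++)
+//         D[i][j] += A[4*kk+i] * B[offB + kk + 4*j];
+// if k is not exhausted, B is then advanced to the start of its next panel.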
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgemm_add_nn_4x4_lib4, @function
+inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r14d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$4, %r15d
+	subl			%r14d, %r15d // 4-offsetB
+	cmpl			%r10d, %r15d
+//	jle				0f
+//	movl			%r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+	cmovgl			%r10d, %r15d // kend=min(k,4-offsetB)
+
+	movl			%r14d, %eax
+	sall			$3, %eax // offsetB*sizeof(double)
+	addq			%rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	subl			$1, %r15d // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r15d
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r13, %r12
+	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgemm_add_nn_4x4_lib4, .-inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10   <- A
+// r11   <- B
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- A+4*4*sizeof(double)
+// r11   <- B+4*4*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
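+//
+// rough sketch of the triangular edge (B is the packed 4x4 upper factor,
+// accessed along its rows; indices in doubles, sketch only):
+//   for(kk=0; kk<4; kk++)
+//     for(j=0; j<=kk; j++)
+//       for(i=0; i<4; i++)
+//         D[i][j] += A[4*kk+i] * B[4*kk+j];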
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_4x4_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#endif
+#endif
+	
+	vmovapd			0(%r10), %ymm8
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+
+	vmovapd			32(%r10), %ymm8
+	vbroadcastsd	32(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	40(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+
+	vmovapd			64(%r10), %ymm8
+	vbroadcastsd	64(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	72(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	80(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+
+	vmovapd			96(%r10), %ymm8
+	vbroadcastsd	96(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	104(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	112(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	120(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+
+	addq			$128, %r10
+	addq			$128, %r11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_4x4_lib4, .-inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- max(k-4,0)
+// r11   <- A+4*4*sizeof(double)
+// r12   <- B+4*4*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_4x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+#endif
+	
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	addq			$32, %r11
+	addq			$32, %r12
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	addq			$32, %r11
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	addq			$32, %r12
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	addq			$32, %r11
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	addq			$32, %r12
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	addq			$32, %r11
+	vbroadcastsd	24(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	addq			$32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_4x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
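+//
+// rough sketch of the offB==0 branch (B is the packed 4x4 lower-triangular
+// block; indices in doubles, sketch only):
+//   for(kk=0; kk<4; kk++)
+//     for(j=0; j<=kk; j++)
+//       for(i=0; i<4; i++)
+//         D[i][j] += A[4*kk+i] * B[kk + 4*j];
+// the offB>0 branches do the same over the triangle rows reachable in the
+// current and the following B panel, one column of A at a time.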
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nn_rl_4x4_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r14d
+	jg		0f
+
+	// offB==0
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+
+	vmovapd			32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	40(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+	vmovapd			64(%r11), %ymm8
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	48(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	80(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	vmovapd			96(%r11), %ymm8
+	vbroadcastsd	24(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	56(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	88(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	120(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A+4*bs*sizeof(double)
+	addq			%r13, %r12 // B+bs*sdb*sizeof(double)
+
+	jmp		3f
+
+0:
+	cmpl	$1, %r14d
+	jg		1f
+
+	// offB==1
+
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+
+	vmovapd			32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	40(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+	vmovapd			64(%r11), %ymm8
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	48(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	80(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	subl			$3, %r10d // k-3
+	addq			$96, %r11 // A+3*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$8, %r12 // B+bs*sdb*sizeof(double)-1
+
+	jmp		3f
+
+1:
+	cmpl	$2, %r14d
+	jg		2f
+
+	// offB==2
+
+	addq			$16, %r12 // B+2*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+
+	vmovapd			32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	40(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+	subl			$2, %r10d // k-2
+	addq			$64, %r11 // A+2*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$16, %r12 // B+bs*sdb*sizeof(double)-2
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	vmovapd			32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	40(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	72(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	104(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	vmovapd			64(%r11), %ymm8
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	48(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	80(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	112(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	vmovapd			96(%r11), %ymm8
+	vbroadcastsd	24(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	56(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	88(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	120(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A+4*bs*sizeof(double)
+	addq			%r13, %r12 // B+bs*sdb*sizeof(double)
+
+	jmp		3f
+
+2:
+	// offB==3
+
+	addq			$24, %r12 // B+3*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-3
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+	vmovapd			32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	40(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	72(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	vmovapd			64(%r11), %ymm8
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	48(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	80(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	112(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	vmovapd			96(%r11), %ymm8
+	vbroadcastsd	24(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	56(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	88(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	120(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A+4*bs*sizeof(double)
+	addq			%r13, %r12 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nn_rl_4x4_lib4, .-inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NN_RL_4X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nn_rl_4x4_gen_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nn_rl_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	cmpl			$0, %r14d
+	jg				0f // offB>0
+
+	// offB==0
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f // end
+
+0:
+	cmpl			$1, %r14d
+	jg				1f // offB>1
+
+	// offB==1
+
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f // end
+
+1:
+	cmpl			$2, %r14d
+	jg				2f // offB>2
+
+	// offB==2
+
+	addq			$16, %r12 // B+2*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f
+
+2:
+	// offB==3
+
+	addq			$24, %r12 // B+3*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nn_rl_4x4_gen_lib4, .-inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for dlauum
+//
+// input arguments:
+// r10   <- A
+// r11   <- B
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- A+4*4*sizeof(double)
+// r11   <- B+4*4*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
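+//
+// rough sketch: same access pattern as the dtrmm nt ru edge, but column kk
+// of A is masked to its first kk+1 entries (indices in doubles, sketch only):
+//   for(kk=0; kk<4; kk++)
+//     for(j=0; j<=kk; j++)
+//       for(i=0; i<=kk; i++)
+//         D[i][j] += A[4*kk+i] * B[4*kk+j];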
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DLAUUM_NT_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dlauum_nt_4x4_lib4, @function
+inner_edge_dlauum_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dlauum_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dlauum_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dlauum_nt_4x4_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vmovapd			0(%r10), %ymm8
+	vblendpd		$0x1, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+
+	vmovapd			32(%r10), %ymm8
+	vblendpd		$0x3, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	32(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	40(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+
+	vmovapd			64(%r10), %ymm8
+	vblendpd		$0x7, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	64(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	72(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	80(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+
+	vmovapd			96(%r10), %ymm8
+	vbroadcastsd	96(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	104(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	112(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	120(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+
+	addq			$128, %r10
+	addq			$128, %r11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dlauum_nt_4x4_lib4, .-inner_edge_dlauum_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for dlauum
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+4*k*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DLAUUM_NT_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dlauum_nt_4x4_vs_lib4, @function
+inner_edge_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dlauum_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dlauum_nt_4x4_vs_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vblendpd		$0x1, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	addq			$32, %r11
+	addq			$32, %r12
+
+	cmpl			$0, %r10d
+	jle				0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vblendpd		$0x3, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	addq			$32, %r11
+	addq			$32, %r12
+
+	cmpl			$0, %r10d
+	jle				0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vblendpd		$0x7, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	addq			$32, %r11
+	addq			$32, %r12
+
+	cmpl			$0, %r10d
+	jle				0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	24(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	addq			$32, %r11
+	addq			$32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dlauum_nt_4x4_vs_lib4, .-inner_edge_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d02 d13 d20 d31]
+// ymm3 <- [d03 d12 d21 d30]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
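+//
+// note: the nt micro-kernel leaves the 4x4 product with its columns rotated
+// across lanes as described above; the two vblendpd stages below regroup the
+// lanes, first within register pairs and then across pairs, into plain columns,
+// e.g. ymm8 = [ymm0[0], ymm1[1], ymm0[2], ymm1[3]] = [d00 d10 d22 d32].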
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_4x4_lib4, @function
+inner_blend_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_4x4_lib4:
+#endif	
+#endif	
+	
+
+	// tc==n
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_4x4_lib4, .-inner_blend_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
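+//
+// rough sketch (the beta==0.0 test below skips reading C entirely; indices
+// in doubles, sketch only):
+//   for(j=0; j<4; j++)
+//     for(i=0; i<4; i++)
+//       D[i][j] = alpha*D[i][j] + beta*C[i + 4*j];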
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_4x4_lib4, @function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
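+//
+// rough sketch of the offset handling (m = offset, C0 = C, C1 = C0 + 4*sdc;
+// indices in doubles, sketch only): column j of C is gathered as
+//   c[i] = (i < 4-m) ? C0[m+i + 4*j] : C1[i-(4-m) + 4*j];
+// and then D[:][j] = alpha*D[:][j] + beta*c[:], as in the non-gen routine.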
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_4x4_gen_lib4, @function
+inner_scale_ab_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_gen_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+
+	vxorpd		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovapd		0(%r13), %ymm12
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmovapd		32(%r13), %ymm12
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm1, %ymm12, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmovapd		96(%r13), %ymm12
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm3, %ymm12, %ymm3
+
+	jmp		3f
+
+0:
+
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$1, %r12d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm1, %ymm13, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm3, %ymm13, %ymm3
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r12d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm1, %ymm13, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm3, %ymm13, %ymm3
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm1, %ymm13, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm3, %ymm13, %ymm3
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_4x4_gen_lib4, .-inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10   <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_A0_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_a0_4x4_lib4, @function
+inner_scale_a0_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_a0_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_4x4_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_a0_4x4_lib4, .-inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_4x4_lib4, @function
+inner_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_4x4_lib4:
+#endif	
+#endif	
+	
+	vmovapd		0(%r10), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_4x4_lib4, .-inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d02 d13 d20 d31]
+// ymm3 <- [d03 d12 d21 d30]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d02 d13 d20 d31]
+// ymm3 <- [d03 d12 d21 d30]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_4x4_gen_lib4, @function
+inner_blend_scale_ab_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_gen_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+
+	vxorpd		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovapd		0(%r13), %ymm12
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmovapd		32(%r13), %ymm12
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm1, %ymm12, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmovapd		96(%r13), %ymm12
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm3, %ymm12, %ymm3
+
+	jmp		3f
+
+0:
+
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$1, %r12d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm1, %ymm13, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm3, %ymm13, %ymm3
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r12d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm1, %ymm13, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm3, %ymm13, %ymm3
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm1, %ymm13, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm3, %ymm13, %ymm3
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_4x4_gen_lib4, .-inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender_loader for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d02 d13 d20 d31]
+// ymm3 <- [d03 d12 d21 d30]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_4x4_lib4, @function
+inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_4x4_lib4:
+#endif	
+#endif	
+	
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmovapd		0(%r10), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_4x4_lib4, .-inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization 
+//
+// input arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
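+//
+// note: for each column k = 0..3 the code checks d_kk > 0, computes 1/sqrt(d_kk)
+// (or 0 if the test fails, via labels 1,3,5,7), stores it in inv_diag_E[k],
+// scales column k by it and subtracts the rank-1 update from the columns to the right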
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dpotrf_4x4_lib4, @function
+inner_edge_dpotrf_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dpotrf_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_lib4:
+#endif
+#endif
+	
+	vxorpd	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd	.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd	LC04(%rip), %xmm14 // 1.0
+#endif
+
+	vmovsd		%xmm0, %xmm0, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+2:
+	vmovsd		%xmm13, 0(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm0
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm1, %ymm1
+	vperm2f128	$0x11, %ymm0, %ymm0, %ymm11
+	vpermilpd	$0x0, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm2, %ymm2
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+
+
+	vpermilpd	$0x3, %xmm1, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+4:
+	vmovsd		%xmm13, 8(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm1
+	vperm2f128	$0x11, %ymm1, %ymm1, %ymm11
+	vpermilpd	$0x0, %ymm11, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm2, %ymm2
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+6:
+	vmovsd		%xmm13, 16(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm2
+	vperm2f128	$0x11, %ymm2, %ymm2, %ymm11
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+
+	vextractf128	$0x1, %ymm3, %xmm13
+	vpermilpd	$0x3, %xmm13, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+8:
+	vmovsd		%xmm13, 24(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm3, %ymm13, %ymm3
+
+	jmp		0f
+
+1:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dpotrf_4x4_lib4, .-inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization 
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
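+//
+// note: same factorization as the non-vs routine, but kn (r11d) bounds the number
+// of columns actually factorized: the early exits skip the updates of columns past kn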
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dpotrf_4x4_vs_lib4, @function
+inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dpotrf_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_vs_lib4:
+#endif
+#endif
+	
+	vxorpd	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd	.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd	LC04(%rip), %xmm14 // 1.0
+#endif
+
+	vmovsd		%xmm0, %xmm0, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+2:
+	vmovsd		%xmm13, 0(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm0
+	cmpl		$2, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm1, %ymm1
+	vperm2f128	$0x11, %ymm0, %ymm0, %ymm11
+	vpermilpd	$0x0, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm2, %ymm2
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+
+
+	vpermilpd	$0x3, %xmm1, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+4:
+	vmovsd		%xmm13, 8(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm1
+	cmpl		$3, %r11d
+	jl			0f // ret
+	vperm2f128	$0x11, %ymm1, %ymm1, %ymm11
+	vpermilpd	$0x0, %ymm11, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm2, %ymm2
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+6:
+	vmovsd		%xmm13, 16(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm2
+	cmpl		$4, %r11d
+	jl			0f // ret
+	vperm2f128	$0x11, %ymm2, %ymm2, %ymm11
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+
+
+	vextractf128	$0x1, %ymm3, %xmm13
+	vpermilpd	$0x3, %xmm13, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+8:
+	vmovsd		%xmm13, 24(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm3, %ymm13, %ymm3
+
+	jmp		0f
+
+1:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dpotrf_4x4_vs_lib4, .-inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
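+//
+// note: forward substitution over the four columns,
+// d_j <- (d_j - sum_{k<j} e_jk * d_k) * inv_diag_e[j],
+// using the strictly lower entries of E and the precomputed reciprocals of its diagonal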
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_4x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vbroadcastsd	8(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm1, %ymm1
+	vbroadcastsd	16(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vbroadcastsd	24(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vbroadcastsd	48(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vbroadcastsd	56(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vbroadcastsd	88(%r10), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
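+//
+// note: same substitution as the non-vs routine; kn (r12d) bounds the number of
+// columns that are computed, the early exits skip the remaining updates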
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	cmpl			$2, %r12d
+	jl				0f // ret
+	vbroadcastsd	8(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm1, %ymm1
+	vbroadcastsd	16(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vbroadcastsd	24(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+
+
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	cmpl			$3, %r12d
+	jl				0f // ret
+	vbroadcastsd	48(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vbroadcastsd	56(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+
+
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	cmpl			$4, %r12d
+	jl				0f // ret
+	vbroadcastsd	88(%r10), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+
+
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
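+//
+// note: same forward substitution as the _inv routine, but with unit diagonal:
+// only the strictly lower entries of E are used and no column scaling is needed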
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_one_4x4_lib4, @function
+inner_edge_dtrsm_rlt_one_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_one_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_4x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	8(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm1, %ymm1
+
+	vbroadcastsd	16(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vbroadcastsd	48(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+
+	vbroadcastsd	24(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vbroadcastsd	56(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vbroadcastsd	88(%r10), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_one_4x4_lib4, .-inner_edge_dtrsm_rlt_one_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10  <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_one_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl			$2, %r11d
+
+	jl				0f // ret
+
+	vbroadcastsd	8(%r10), %ymm13
+	cmpl			$3, %r11d
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm1, %ymm1
+
+	jl				0f // ret
+
+	vbroadcastsd	16(%r10), %ymm13
+	cmpl			$4, %r11d
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vbroadcastsd	48(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+
+	jl				0f // ret
+
+	vbroadcastsd	24(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vbroadcastsd	56(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vbroadcastsd	88(%r10), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_one_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
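+//
+// note: backward substitution, processing the columns from 3 down to 0: each column
+// is scaled by the reciprocal of its diagonal entry and then eliminated from the
+// columns to its left using the above-diagonal entries of E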
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rut_inv_4x4_lib4, @function
+inner_edge_dtrsm_rut_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rut_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_4x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vbroadcastsd	112(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	104(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	96(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vbroadcastsd	72(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vbroadcastsd	32(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rut_inv_4x4_lib4, .-inner_edge_dtrsm_rut_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rut_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rut_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl			$3, %r12d
+	jle				0f
+
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vbroadcastsd	112(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	104(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	96(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+0:
+	cmpl			$2, %r12d
+	jle				1f
+
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vbroadcastsd	72(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+1:
+	cmpl			$1, %r12d
+	jle				2f
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vbroadcastsd	32(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+2:
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rut_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
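+//
+// note: E is upper triangular and not transposed, so this is a forward substitution
+// over the columns: column j is updated with the already-solved columns k < j
+// through e_kj and then scaled by inv_diag_e[j]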
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_run_inv_4x4_lib4, @function
+inner_edge_dtrsm_run_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_run_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_4x4_lib4:
+#endif
+#endif
+
+	// first column
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+
+	// second column
+	vbroadcastsd	32(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+
+	// third column
+	vbroadcastsd	64(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	72(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+
+	// fourth column
+	vbroadcastsd	96(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vbroadcastsd	104(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vbroadcastsd	112(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_run_inv_4x4_lib4, .-inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
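+//
+// note: forward substitution on the rows from the top down: for each of the first
+// three rows, the corresponding column of E (with the diagonal and the entries
+// above it masked to zero by the blend with the zero register) is multiplied by
+// that row of each right-hand-side column and subtracted from it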
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lln_one_4x4_lib4, @function
+inner_edge_dtrsm_lln_one_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lln_one_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_4x4_lib4:
+#endif
+#endif
+
+	vxorpd		%ymm14, %ymm14, %ymm14
+
+	vmovapd		0(%r10), %ymm12
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm0, %ymm0
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm1, %ymm1
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm2, %ymm2
+	vperm2f128	$0x00, %ymm3, %ymm3, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+
+	vmovapd		32(%r10), %ymm12
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm0, %ymm0
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm1, %ymm1
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm2, %ymm2
+	vperm2f128	$0x00, %ymm3, %ymm3, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+
+	vmovapd		64(%r10), %ymm12
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x11, %ymm0, %ymm0, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm0, %ymm0
+	vperm2f128	$0x11, %ymm1, %ymm1, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm1, %ymm1
+	vperm2f128	$0x11, %ymm2, %ymm2, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm2, %ymm2
+	vperm2f128	$0x11, %ymm3, %ymm3, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lln_one_4x4_lib4, .-inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
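+//
+// note: back substitution on the rows from the bottom up: row k of each
+// right-hand-side column is scaled by inv_diag_e[k] and blended back into place,
+// then the above-diagonal part of column k of E is used to eliminate it from
+// the rows above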
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LUN_INV_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lun_inv_4x4_lib4, @function
+inner_edge_dtrsm_lun_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lun_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_4x4_lib4:
+#endif
+#endif
+	
+	vmovapd			96(%r10), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	24(%r11), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovapd			64(%r10), %xmm13
+	vbroadcastsd	16(%r11), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			32(%r10), %xmm13
+	vbroadcastsd	8(%r11), %ymm12
+
+	vpermilpd		$0xf, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vpermilpd		$0xf, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vpermilpd		$0xf, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vpermilpd		$0xf, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vbroadcastsd	0(%r11), %ymm12
+
+	vmulpd			%ymm0, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm0, %ymm0
+
+	vmulpd			%ymm1, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm1, %ymm1
+
+	vmulpd			%ymm2, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm2, %ymm2
+
+	vmulpd			%ymm3, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lun_inv_4x4_lib4, .-inner_edge_dtrsm_lun_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LUN_INV_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lun_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lun_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl	$3, %r12d
+	jle		0f
+
+	vmovapd			96(%r10), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	24(%r11), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+0:
+	cmpl	$2, %r12d
+	jle		1f
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovapd			64(%r10), %xmm13
+	vbroadcastsd	16(%r11), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+1:
+	cmpl	$1, %r12d
+	jle		2f
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			32(%r10), %xmm13
+	vbroadcastsd	8(%r11), %ymm12
+
+	vpermilpd		$0xf, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vpermilpd		$0xf, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vpermilpd		$0xf, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vpermilpd		$0xf, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+2:
+
+	vbroadcastsd	0(%r11), %ymm12
+
+	vmulpd			%ymm0, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm0, %ymm0
+
+	vmulpd			%ymm1, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm1, %ymm1
+
+	vmulpd			%ymm2, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm2, %ymm2
+
+	vmulpd			%ymm3, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lun_inv_4x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+//
+// input arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
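+//
+// note: LU factorization without pivoting, one column at a time: each column is
+// first updated with the previously factorized columns, the reciprocal of its
+// diagonal (pivot) entry is stored in inv_diag_E and used to scale the entries
+// below the diagonal, while the blends keep the already-final upper part unchanged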
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGETRF_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgetrf_4x4_lib4, @function
+inner_edge_dgetrf_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgetrf_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_4x4_lib4:
+#endif
+#endif
+	
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd	.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd	LC04(%rip), %xmm14 // 1.0
+#endif
+	vmovddup	%xmm14, %xmm14
+
+	// first column
+//	vblendpd	$0x1, %ymm0, %ymm12, %ymm12
+	vmovapd		%ymm0, %ymm12
+	vmovddup	%xmm0, %xmm13
+	vdivpd		%xmm13, %xmm14, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd		%xmm13, 0(%r10)
+	vmulpd		%ymm0, %ymm13, %ymm0
+	vblendpd	$0x1, %ymm12, %ymm0, %ymm0
+
+	// second column
+	vmovddup	%xmm1, %xmm12
+	vperm2f128	$0x00, %ymm12, %ymm12, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm1, %ymm1
+	vblendpd	$0x2, %ymm1, %ymm13, %ymm12
+
+	vpermilpd	$0x3, %xmm1, %xmm13
+	vdivpd		%xmm13, %xmm14, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd		%xmm13, 8(%r10)
+	vmulpd		%ymm1, %ymm13, %ymm1
+	vblendpd	$0x3, %ymm12, %ymm1, %ymm1
+
+	// third column
+	vmovddup	%xmm2, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm2, %ymm2
+	vblendpd	$0x2, %ymm2, %ymm13, %ymm12
+
+	vpermilpd	$0x3, %xmm2, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm2, %ymm2
+	vblendpd	$0x4, %ymm2, %ymm12, %ymm12
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vmovddup	%xmm13, %xmm13
+	vdivpd		%xmm13, %xmm14, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd		%xmm13, 16(%r10)
+	vmulpd		%ymm2, %ymm13, %ymm2
+	vblendpd	$0x7, %ymm12, %ymm2, %ymm2
+
+	// fourth column
+	vmovddup	%xmm3, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+	vblendpd	$0x2, %ymm3, %ymm13, %ymm12
+
+	vpermilpd	$0x3, %xmm3, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+	vblendpd	$0x4, %ymm3, %ymm12, %ymm12
+
+	vperm2f128	$0x11, %ymm3, %ymm3, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+	vblendpd	$0x8, %ymm3, %ymm12, %ymm12
+	
+	vextractf128	$0x1, %ymm3, %xmm13
+	vpermilpd	$0x3, %xmm13, %xmm13
+	vdivpd		%xmm13, %xmm14, %xmm13
+//	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd		%xmm13, 24(%r10)
+//	vmulpd		%ymm3, %ymm13, %ymm3
+	vblendpd	$0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgetrf_4x4_lib4, .-inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+	
+	vmovapd %ymm0,  0(%r10)
+	vmovapd %ymm1, 32(%r10)
+	vmovapd %ymm2, 64(%r10)
+	vmovapd %ymm3, 96(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
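+//
+// note: km is converted to double and compared against the lane-index constant
+// .LC02 to build the store mask, so vmaskmovpd writes only the first km rows;
+// kn bounds the number of columns that are written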
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_vs_lib4, @function
+inner_store_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	cmpl		$2, %r12d
+	vmaskmovpd	%ymm0, %ymm15,  0(%r10)
+	jl			0f // end
+	cmpl		$3, %r12d
+	vmaskmovpd	%ymm1, %ymm15, 32(%r10)
+	jl			0f // end
+	vmaskmovpd	%ymm2, %ymm15, 64(%r10)
+	je			0f // end
+	vmaskmovpd	%ymm3, %ymm15, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_vs_lib4, .-inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular
+//
+// input arguments:
+// r10   <- D
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
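+//
+// note: only the lower triangle of the 4x4 block is overwritten: for columns 1..3
+// the entries above the diagonal are reloaded from D and blended back before the store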
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_4x4_lib4, @function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_lib4:
+#endif
+#endif
+	
+	vmovapd		%ymm0, 0(%r10)
+	vmovapd		32(%r10), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm1, %ymm1	
+	vmovapd		%ymm1, 32(%r10)
+	vmovapd		64(%r10), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm2, %ymm2	
+	vmovapd		%ymm2, 64(%r10)
+	vmovapd		96(%r10), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm3, %ymm3	
+	vmovapd		%ymm3, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs lower triangular
+//
+// input arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_4x4_vs_lib4, @function
+inner_store_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	vmaskmovpd	%ymm0, %ymm15,  0(%r10)
+	cmpl		$2, %r12d
+	jl			0f // end
+	vmovapd		32(%r10), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm1, %ymm1	
+	vmaskmovpd	%ymm1, %ymm15, 32(%r10)
+	cmpl		$3, %r12d
+	jl			0f // end
+	vmovapd		64(%r10), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm2, %ymm2	
+	vmaskmovpd	%ymm2, %ymm15, 64(%r10)
+	je			0f // end
+	vmovapd		96(%r10), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm3, %ymm3	
+	vmaskmovpd	%ymm3, %ymm15, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_4x4_vs_lib4, .-inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
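+//
+// note: the row mask is built from the [m0,m1) range, the column shift and count
+// come from n0/n1, and a non-zero offset rotates each column and splits the store
+// between the two panels D0 and D1 (4*sdd*sizeof(double) apart) with
+// complementary masks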
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_gen_lib4, @function
+inner_store_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_gen_lib4:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2sd	%r13d, %xmm14, %xmm14
+	vcvtsi2sd	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm12
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm12, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm12, %ymm15
+	vandpd		%ymm14, %ymm15, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm3, %ymm2
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm2, %ymm1
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	vmaskmovpd	%ymm0, %ymm15,  0(%r11)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vmaskmovpd	%ymm1, %ymm15, 32(%r11)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vmaskmovpd	%ymm2, %ymm15, 64(%r11)
+	je			3f // end
+	vmaskmovpd	%ymm3, %ymm15, 96(%r11)
+
+	jmp		3f
+
+0:
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+	cmpl	$1, %r10d
+	jg		1f
+
+	// offset==1
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm0, %ymm12, %ymm0
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm1, %ymm12, %ymm1
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm2, %ymm12, %ymm2
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm3, %ymm12, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm15, %ymm12, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC08(%rip), %ymm12
+	vmovupd		.LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC08(%rip), %ymm12
+	vmovupd		LC05(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	cmpl		$2, %r15d
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm0, %ymm13, 0(%rbx)
+	jl			3f // end
+	cmpl		$3, %r15d
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm1, %ymm13, 32(%rbx)
+	jl			3f // end
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm2, %ymm13, 64(%rbx)
+	je			3f // end
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm3, %ymm13, 96(%rbx)
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r10d
+	jg		2f
+
+	// offset==2
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm1
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm2
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC09(%rip), %ymm12
+	vmovupd		.LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC09(%rip), %ymm12
+	vmovupd		LC06(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	cmpl		$2, %r15d
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm0, %ymm13, 0(%rbx)
+	jl			3f // end
+	cmpl		$3, %r15d
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm1, %ymm13, 32(%rbx)
+	jl			3f // end
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm2, %ymm13, 64(%rbx)
+	je			3f // end
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm3, %ymm13, 96(%rbx)
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm1, %ymm1
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm2, %ymm2
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm3, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC10(%rip), %ymm12
+	vmovupd		.LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC10(%rip), %ymm12
+	vmovupd		LC07(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	cmpl		$2, %r15d
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm0, %ymm13, 0(%rbx)
+	jl			3f // end
+	cmpl		$3, %r15d
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm1, %ymm13, 32(%rbx)
+	jl			3f // end
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm2, %ymm13, 64(%rbx)
+	je			3f // end
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm3, %ymm13, 96(%rbx)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_gen_lib4, .-inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_4X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_4x4_gen_lib4, @function
+inner_store_l_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_gen_lib4:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2sd	%r13d, %xmm14, %xmm14
+	vcvtsi2sd	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm12
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm12, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm12, %ymm15
+	vandpd		%ymm14, %ymm15, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm3, %ymm2
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm2, %ymm1
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm14
+#endif
+
+	vmaskmovpd	%ymm0, %ymm15,  0(%r11)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x1, %ymm14, %ymm15, %ymm15
+	vmaskmovpd	%ymm1, %ymm15, 32(%r11)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x2, %ymm14, %ymm15, %ymm15
+	vmaskmovpd	%ymm2, %ymm15, 64(%r11)
+	je			3f // end
+	vblendpd	$0x4, %ymm14, %ymm15, %ymm15
+	vmaskmovpd	%ymm3, %ymm15, 96(%r11)
+
+	jmp		3f
+
+0:
+	
+	cmpl	$1, %r10d
+	jg		1f
+
+	// offset==1
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm0, %ymm12, %ymm0
+
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm1, %ymm12, %ymm1
+
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm2, %ymm12, %ymm2
+
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm3, %ymm12, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm15, %ymm12, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC08(%rip), %ymm12
+	vmovupd		.LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC08(%rip), %ymm12
+	vmovupd		LC05(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm14
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x2, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x4, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 1)
+	je			3f // end
+	vblendpd	$0x8, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 1)
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r10d
+	jg		2f
+
+	// offset==2
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm0
+
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm1
+
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm2
+
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC09(%rip), %ymm12
+	vmovupd		.LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC09(%rip), %ymm12
+	vmovupd		LC06(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm14
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x4, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x8, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 1)
+	je			3f // end
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 1)
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm0, %ymm0
+
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm1, %ymm1
+
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm2, %ymm2
+
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm3, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC10(%rip), %ymm12
+	vmovupd		.LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC10(%rip), %ymm12
+	vmovupd		LC07(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm14
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x8, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 1)
+	je			3f // end
+	vblendpd	$0x2, %ymm14, %ymm13, %ymm13
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 1)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_4x4_gen_lib4, .-inner_store_l_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+//                               1      2              3          4          5             6          7
+// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
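+//
+// computes D = alpha * A * B^T + beta * C on a 4x4 block in panel-major (lib4)
+// storage: the nt inner kernel accumulates A * B^T over k, the blend-scale
+// routine applies alpha, beta and C, and the final store writes D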
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x4_lib4
+	.type kernel_dgemm_nt_4x4_lib4, @function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x4_lib4
+	.def kernel_dgemm_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx        r8            r9         rsp+8     rsp+16   rsp+24
+// void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
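+// Variable-size variant: same computation as kernel_dgemm_nt_4x4_lib4, but only
+// km rows and kn columns of the 4x4 result are stored (km, kn <= 4), presumably
+// to handle matrix edges whose size is not a multiple of 4.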
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x4_vs_lib4
+	.type kernel_dgemm_nt_4x4_vs_lib4, @function
+kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x4_vs_lib4
+_kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x4_vs_lib4
+	.def kernel_dgemm_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_vs_lib4, .-kernel_dgemm_nt_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                   1      2              3          4          5             6            7          8        9            10         11       12      13      14      15
+// void kernel_dgemm_nt_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
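+// Generalized variant: C and D carry a row offset inside their panels (offsetC,
+// offsetD) and a panel stride (sdc, sdd), and only elements with row index in
+// [m0, m1) and column index in [n0, n1) appear to be written, via the masked
+// vmaskmovpd stores of inner_store_4x4_gen_lib4.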
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x4_gen_lib4
+	.type kernel_dgemm_nt_4x4_gen_lib4, @function
+kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x4_gen_lib4
+_kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x4_gen_lib4
+	.def kernel_dgemm_nt_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // offsetC
+	movq	ARG7, %r13 // C
+	movq	ARG8, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG9, %r10 // offsetD
+	movq	ARG10, %r11 // D
+	movq	ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG12, %r13 // m0
+	movq	ARG13, %r14 // m1
+	movq	ARG14, %r15 // n0
+	movq	ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_gen_lib4, .-kernel_dgemm_nt_4x4_gen_lib4
+#endif
+
+
+
+
+
+//                               rdi    rsi            rdx        rcx          r8         r9       rsp+8         rsp+16     rsp+24
+// void kernel_dgemm_nn_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
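+// "nn" variant: D = beta*C + alpha * A * B, with B stored across 4-row panels;
+// sdb is the panel stride of B and offsetB the row offset of its first element
+// inside the panel, handled by the inner_edge routine before the main loop.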
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_4x4_lib4
+	.type kernel_dgemm_nn_4x4_lib4, @function
+kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_4x4_lib4
+_kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_4x4_lib4
+	.def kernel_dgemm_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_4x4_lib4, .-kernel_dgemm_nn_4x4_lib4
+#endif
+
+
+
+
+
+//                                   rdi    rsi            rdx        rcx       r8         r9       rsp+8         rsp+16    rsp+24     rsp+32    rsp+40   rsp+48     rsp+56   rsp+64  rsp+72  rsp+80  rsp+88
+// void kernel_dgemm_nn_4x4_gen_lib4(int k, double *alpha, double *A, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_4x4_gen_lib4
+	.type kernel_dgemm_nn_4x4_gen_lib4, @function
+kernel_dgemm_nn_4x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_4x4_gen_lib4
+_kernel_dgemm_nn_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_4x4_gen_lib4
+	.def kernel_dgemm_nn_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // offsetC
+	movq	ARG9, %r13 // C
+	movq	ARG10, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG11, %r10 // offsetD
+	movq	ARG12, %r11 // D
+	movq	ARG13, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG14, %r13 // m0
+	movq	ARG15, %r14 // m1
+	movq	ARG16, %r15 // n0
+	movq	ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_4x4_gen_lib4, .-kernel_dgemm_nn_4x4_gen_lib4
+#endif
+
+
+
+
+
+//                               rdi    rsi            rdx        rcx        r8            r9         rsp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
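+// Symmetric rank-k update: same accumulation as the dgemm nt kernel
+// (D = beta*C + alpha * A * B^T), but only the lower triangle of the 4x4 block is
+// written back (inner_store_l_4x4_lib4).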
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_4x4_lib4
+	.type kernel_dsyrk_nt_l_4x4_lib4, @function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_4x4_lib4
+	.def kernel_dsyrk_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call	inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq	_inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx        r8            r9         rsp+8     rsp+16   rsp+24
+// void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_4x4_vs_lib4
+	.type kernel_dsyrk_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_4x4_vs_lib4
+_kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_4x4_vs_lib4
+	.def kernel_dsyrk_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call	inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq	_inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_4x4_vs_lib4, .-kernel_dsyrk_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                     rdi    rsi            rdx        rcx        r8            r9           rsp+8      rsp+16   rsp+24       rsp+32     rsp+40   rsp+48  rsp+56  rsp+64  rsp+72
+// void kernel_dsyrk_nt_l_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_4x4_gen_lib4
+	.type kernel_dsyrk_nt_l_4x4_gen_lib4, @function
+kernel_dsyrk_nt_l_4x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_4x4_gen_lib4
+_kernel_dsyrk_nt_l_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_4x4_gen_lib4
+	.def kernel_dsyrk_nt_l_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // offsetC
+	movq	ARG7, %r13 // C
+	movq	ARG8, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG9, %r10 // offsetD
+	movq	ARG10, %r11 // D
+	movq	ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG12, %r13 // m0
+	movq	ARG13, %r14 // m1
+	movq	ARG14, %r15 // n0
+	movq	ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_4x4_gen_lib4, .-kernel_dsyrk_nt_l_4x4_gen_lib4
+#endif
+
+
+
+
+
+//                                  1      2              3          4            5          6        7
+// void kernel_dtrmm_nn_rl_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *D);
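+// Triangular matrix multiply, right/lower, both operands non-transposed:
+// D = alpha * A * B with B lower triangular; the first 4x4 triangle is handled by
+// the inner_edge routine, and this kernel has no beta/C term.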
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nn_rl_4x4_lib4
+	.type kernel_dtrmm_nn_rl_4x4_lib4, @function
+kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nn_rl_4x4_lib4
+_kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nn_rl_4x4_lib4
+	.def kernel_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG5, %r12 // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nn_rl_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nn_rl_4x4_lib4, .-kernel_dtrmm_nn_rl_4x4_lib4
+#endif
+
+
+
+
+
+//                                      rdi    rsi            rdx        rcx          r8         r9       rsp+8        rsp+16     rsp+24   rsp+32  rsp+40  rsp+48  rsp+56
+// void kernel_dtrmm_nn_rl_4x4_gen_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nn_rl_4x4_gen_lib4
+	.type kernel_dtrmm_nn_rl_4x4_gen_lib4, @function
+kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nn_rl_4x4_gen_lib4
+_kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nn_rl_4x4_gen_lib4
+	.def kernel_dtrmm_nn_rl_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG5, %r12 // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NN_RL_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // offsetD
+	movq	ARG8, %r11 // D
+	movq	ARG9, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG10, %r13 // m0
+	movq	ARG11, %r14 // m1
+	movq	ARG12, %r15 // n0
+	movq	ARG13, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nn_rl_4x4_gen_lib4, .-kernel_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx        r8            r9         rsp+8
+// void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
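+// Triangular matrix multiply, right/upper, B transposed: the plain gemm loop runs
+// over the trailing k-4 columns (A+4*bs, B+4*bs), the initial 4x4 triangle is
+// added by the inner_edge routine, and the result is scaled as
+// D = beta*C + alpha * A * B^T with B taken as upper triangular.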
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_4x4_lib4
+	.type kernel_dtrmm_nt_ru_4x4_lib4, @function
+kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_4x4_lib4
+_kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_4x4_lib4
+	.def kernel_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // B
+	addq	$128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG3, %r10
+	movq	ARG4, %r11
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_4x4_lib4, .-kernel_dtrmm_nt_ru_4x4_lib4
+#endif
+
+
+
+
+
+//                                     rdi    rsi            rdx        rcx        r8            r9         rsp+8     rsp+16   rsp+24
+// void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+	.type kernel_dtrmm_nt_ru_4x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_4x4_vs_lib4
+_kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+	.def kernel_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // B
+	addq	$128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+	// call inner loader nn
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_4x4_vs_lib4, .-kernel_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                  edi    rsi        rdx        rcx        r8         r9
+// void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
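+// Cholesky factorization kernel: the accumulator is loaded with C - A*B^T
+// (sub gemm + blend_scale_11), inner_edge_dpotrf factorizes the 4x4 block, and the
+// lower factor is stored in D; inv_diag_D presumably receives the reciprocals of
+// the diagonal of D for use by the following dtrsm kernels.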
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_4x4_lib4
+	.type kernel_dpotrf_nt_l_4x4_lib4, @function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_4x4_lib4
+	.def kernel_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+//                                     edi    rsi        rdx        rcx        r8         r9                  rsp+8   rsp+16
+// void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_4x4_vs_lib4
+	.type kernel_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_4x4_vs_lib4
+	.def kernel_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movq	ARG8, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG7, %r11 // km 
+	movq	ARG8, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                        edi     rsi         rdx         ecx     r8          r9          rsp+8      rsp+16     rsp+24
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
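+// Fused syrk + potrf: accumulates C + Ap*Bp^T - Am*Bm^T (add gemm over kp columns,
+// sub gemm over km columns), then factorizes the 4x4 block as in
+// kernel_dpotrf_nt_l_4x4_lib4 and stores the lower factor in D.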
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+//                                           edi     rsi         rdx         ecx     r8          r9          rsp+8      rsp+16     rsp+24             rsp+32   rsp+40
+// void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+	movq	ARG10, %r11 // km 
+	movq	ARG11, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx        r8         r9         rsp+8     
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
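+// Triangular solve, right/lower/transposed, with inverted diagonal: the block
+// C - A*B^T is formed first, then inner_edge_dtrsm_rlt_inv appears to compute
+// D = (C - A*B^T) * E^-T, using the precomputed reciprocals in inv_diag_E instead
+// of divisions.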
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+	.type kernel_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+	.def kernel_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                            edi     rsi         rdx         ecx     r8          r9          rsp+8      rsp+16     rsp+24     rsp+32
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
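+// Fused gemm + trsm: accumulates C + Ap*Bp^T - Am*Bm^T, then applies the same
+// right/lower/transposed solve with E and inv_diag_E as
+// kernel_dtrsm_nt_rl_inv_4x4_lib4.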
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10   // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx        r8         r9         rsp+8               rsp+16  rsp+24  
+// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.type kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.def kernel_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn // TODO scale gen
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                               edi     rsi         rdx         ecx     r8          r9          rsp+8    rsp+16     rsp+24     rsp+32                rsp+40 rsp+48
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10  // C 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D 
+	movq	ARG11, %r11 // km 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx        r8         r9
+// void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E);
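+// Unit-diagonal variant of the right/lower solve: E is assumed lower triangular
+// with ones on the diagonal, so no inv_diag_E argument is needed.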
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_one_4x4_lib4
+	.type kernel_dtrsm_nt_rl_one_4x4_lib4, @function
+kernel_dtrsm_nt_rl_one_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_one_4x4_lib4
+_kernel_dtrsm_nt_rl_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_one_4x4_lib4
+	.def kernel_dtrsm_nt_rl_one_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_one_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_one_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_one_4x4_lib4, .-kernel_dtrsm_nt_rl_one_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx        r8         r9         rsp+8   rsp+16
+// void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+	.type kernel_dtrsm_nt_rl_one_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+	.def kernel_dtrsm_nt_rl_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG8, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG7, %r11 // km 
+	movq	ARG8, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_one_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx        r8         r9         rsp+8
+// void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
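+// Right/upper/transposed variant: same structure as the right/lower solve, but the
+// edge routine uses an upper triangular E (inner_edge_dtrsm_rut_inv).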
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_ru_inv_4x4_lib4
+	.type kernel_dtrsm_nt_ru_inv_4x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_ru_inv_4x4_lib4
+_kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_ru_inv_4x4_lib4
+	.def kernel_dtrsm_nt_ru_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11 // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rut_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rut_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_ru_inv_4x4_lib4, .-kernel_dtrsm_nt_ru_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx        r8         r9         rsp+8                rsp+16  rsp+24
+// void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+	.type kernel_dtrsm_nt_ru_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+	.def kernel_dtrsm_nt_ru_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11 // inv_diag_E 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_ru_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx      r8         r9         rsp+8      rsp+16
+// void kernel_dtrsm_nn_ru_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
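+// "nn" solve: the update term is A*B, with B spanning panels of stride sdb, and E
+// is upper triangular; the edge routine appears to compute D = (C - A*B) * E^-1
+// using the reciprocals in inv_diag_E.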
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ru_inv_4x4_lib4
+	.type kernel_dtrsm_nn_ru_inv_4x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ru_inv_4x4_lib4
+_kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ru_inv_4x4_lib4
+	.def kernel_dtrsm_nn_ru_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+	movq	ARG8, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_run_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ru_inv_4x4_lib4, .-kernel_dtrsm_nn_ru_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx      r8         r9         rsp+8      rsp+16              rsp+24  rsp+32
+// void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+	.type kernel_dtrsm_nn_ru_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+	.def kernel_dtrsm_nn_ru_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+	movq	ARG8, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_run_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+	movq	ARG9, %r11  // km 
+	movq	ARG10, %r12  // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ru_inv_4x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx      r8         r9         rsp+8
+// void kernel_dtrsm_nn_ll_one_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E);
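+//
+// operation (as read from the kernel body below): solve E * D = C - A*B for D,
+// with E lower triangular and unit diagonal (hence no inv_diag argument)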
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ll_one_4x4_lib4
+	.type kernel_dtrsm_nn_ll_one_4x4_lib4, @function
+kernel_dtrsm_nn_ll_one_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ll_one_4x4_lib4
+_kernel_dtrsm_nn_ll_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ll_one_4x4_lib4
+	.def kernel_dtrsm_nn_ll_one_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lln_one_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ll_one_4x4_lib4, .-kernel_dtrsm_nn_ll_one_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx      r8         r9         rsp+8      rsp+16  rsp+24
+// void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn);
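+//
+// variable-size variant of the kernel above: km/kn select the rows/columns actually stored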
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+	.type kernel_dtrsm_nn_ll_one_4x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+	.def kernel_dtrsm_nn_ll_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lln_one_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+	movq	ARG8, %r11  // km 
+	movq	ARG9, %r12  // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ll_one_4x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx      r8         r9         rsp+8      rsp+16
+// void kernel_dtrsm_nn_lu_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
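+//
+// operation (as read from the kernel body below): solve E * D = C - A*B for D,
+// with E upper triangular and inv_diag_E holding the precomputed reciprocals of its diagonal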
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_lu_inv_4x4_lib4
+	.type kernel_dtrsm_nn_lu_inv_4x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_lu_inv_4x4_lib4
+_kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_lu_inv_4x4_lib4
+	.def kernel_dtrsm_nn_lu_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+	movq	ARG8, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lun_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lun_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_lu_inv_4x4_lib4, .-kernel_dtrsm_nn_lu_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx      r8         r9         rsp+8      rsp+16              rsp+24  rsp+32
+// void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
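+//
+// variable-size variant of the kernel above: km and kn select the rows/columns actually
+// stored; km is also forwarded to the triangular-solve routine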
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+	.type kernel_dtrsm_nn_lu_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+	.def kernel_dtrsm_nn_lu_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+	movq	ARG8, %r11  // inv_diag_E 
+	movq	ARG9, %r12  // km 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LUN_INV_4X4_VS_LIB4 // TODO
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lun_inv_4x4_vs_lib4 // TODO
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lun_inv_4x4_vs_lib4 // TODO
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+	movq	ARG9, %r11  // km 
+	movq	ARG10, %r12  // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_lu_inv_4x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                edi    rsi        rdx        rcx      r8         r9         rsp+8
+// void kernel_dgetrf_nn_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D);
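+//
+// operation (as read from the kernel body below): LU factorization without pivoting of the
+// 4x4 block C - A*B; the factors are stored in D and the inverted diagonal in inv_diag_D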
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_4x4_lib4
+	.type kernel_dgetrf_nn_4x4_lib4, @function
+kernel_dgetrf_nn_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_4x4_lib4
+_kernel_dgetrf_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_4x4_lib4
+	.def kernel_dgetrf_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG7, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_4x4_lib4, .-kernel_dgetrf_nn_4x4_lib4
+#endif
+
+
+
+
+
+//                                   edi    rsi        rdx        rcx      r8         r9         rsp+8               rsp+16  rsp+24
+// void kernel_dgetrf_nn_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn);
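+//
+// variable-size variant of the kernel above: km/kn select the rows/columns actually stored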
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_4x4_vs_lib4
+	.type kernel_dgetrf_nn_4x4_vs_lib4, @function
+kernel_dgetrf_nn_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_4x4_vs_lib4
+_kernel_dgetrf_nn_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_4x4_vs_lib4
+	.def kernel_dgetrf_nn_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG7, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+	movq	ARG8, %r11  // km 
+	movq	ARG9, %r12  // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_4x4_vs_lib4, .-kernel_dgetrf_nn_4x4_vs_lib4
+#endif
+
+
+
+
+
+#if 0
+//                                   rdi    rsi            rdx        rcx        r8            r9         rsp+8
+// void kernel_dlauum_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dlauum_nt_4x4_lib4
+	.type kernel_dlauum_nt_4x4_lib4, @function
+kernel_dlauum_nt_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dlauum_nt_4x4_lib4
+_kernel_dlauum_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dlauum_nt_4x4_lib4
+	.def kernel_dlauum_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dlauum_nt_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // B
+	addq	$128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DLAUUM_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dlauum_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dlauum_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner loader nn
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlauum_nt_4x4_lib4, .-kernel_dlauum_nt_4x4_lib4
+#endif
+
+
+
+
+
+//                                   rdi    rsi            rdx        rcx        r8            r9         rsp+8      rsp+16  rsp+24
+// void kernel_dlauum_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dlauum_nt_4x4_vs_lib4
+	.type kernel_dlauum_nt_4x4_vs_lib4, @function
+kernel_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dlauum_nt_4x4_vs_lib4
+_kernel_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dlauum_nt_4x4_vs_lib4
+	.def kernel_dlauum_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dlauum_nt_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // B
+	addq	$128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DLAUUM_NT_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dlauum_nt_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+	// call inner loader nn
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlauum_nt_4x4_vs_lib4, .-kernel_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+//                             1         2           3           4
+// void kernel_dlarfb4_r_4_lib4(int kmax, double *pV, double *pT, double *pD);
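+//
+// applies the block of 4 reflectors encoded in (pV, pT) to pD from the right; as read from
+// the body below this amounts to:  W = D * V^T,  W = W * T,  D = D + W * V,
+// where the leading 4x4 block of V is triangular with implicit unit diagonal and T is a
+// 4x4 upper triangular matrix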
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dlarfb4_r_4_lib4
+	.type kernel_dlarfb4_r_4_lib4, @function
+kernel_dlarfb4_r_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dlarfb4_r_4_lib4
+_kernel_dlarfb4_r_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dlarfb4_r_4_lib4
+	.def kernel_dlarfb4_r_4_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // D
+	movq	ARG2, %r12 // V
+
+	subl	$4, %r10d // k-4
+	addq	$128, %r11 // D+4*bs
+	addq	$128, %r12 // V+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // D
+	movq	ARG2, %r12 // V
+
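+	// finish W = D * V^T with the leading 4x4 block of V: its unit diagonal is implicit
+	// and only its strictly upper entries are read (the trailing columns were handled by
+	// the dgemm_add_nt call above)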
+	//
+	vmovapd			0(%r11), %ymm12
+	vaddpd			%ymm12, %ymm0, %ymm0
+	//
+	vmovapd			32(%r11), %ymm12
+	vaddpd			%ymm12, %ymm1, %ymm1
+	vbroadcastsd	32(%r12), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	//
+	vmovapd			64(%r11), %ymm12
+	vaddpd			%ymm12, %ymm2, %ymm2
+	vbroadcastsd	64(%r12), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	72(%r12), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	//
+	vmovapd			96(%r11), %ymm12
+	vaddpd			%ymm12, %ymm3, %ymm3
+	vbroadcastsd	96(%r12), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	104(%r12), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	112(%r12), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
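+	// W = W * T, with T read as a 4x4 upper triangular matrix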
+	movq	ARG3, %r10 // T
+
+	//
+	vbroadcastsd	120(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	//
+	vbroadcastsd	112(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vbroadcastsd	80(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	//
+	vbroadcastsd	104(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vbroadcastsd	72(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	40(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	//
+	vbroadcastsd	96(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vbroadcastsd	64(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	32(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	0(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+
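+	// D = D + W * V: the leading (unit-diagonal) triangular 4x4 block of V is handled here,
+	// the trailing columns by the dgebp call below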
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // V
+	movq	ARG4, %r12 // D
+
+	//
+	vmovapd			0(%r12), %ymm12
+	vaddpd			%ymm12, %ymm0, %ymm12
+	vmovapd			%ymm12, 0(%r12)
+	//
+	vmovapd			32(%r12), %ymm12
+	vbroadcastsd	32(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vaddpd			%ymm12, %ymm1, %ymm12
+	vmovapd			%ymm12, 32(%r12)
+	//
+	vmovapd			64(%r12), %ymm12
+	vbroadcastsd	64(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	72(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vaddpd			%ymm12, %ymm2, %ymm12
+	vmovapd			%ymm12, 64(%r12)
+	//
+	vmovapd			96(%r12), %ymm12
+	vbroadcastsd	96(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	104(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	112(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vaddpd			%ymm12, %ymm3, %ymm12
+	vmovapd			%ymm12, 96(%r12)
+
+	subl	$4, %r10d // k-4
+	addq	$128, %r11 // V+4*bs
+	addq	$128, %r12 // D+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEBP_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgebp_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgebp_add_nn_4x4_lib4
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlarfb4_r_4_lib4, .-kernel_dlarfb4_r_4_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { -1 -1 -1 1 }
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { -1 -1 -1 -1 }
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 3.5 2.5 1.5 0.5 }
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { 7.5 6.5 5.5 4.5 }
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC04: // { 1.0 1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemm_8x4_lib4.S b/kernel/avx/kernel_dgemm_8x4_lib4.S
new file mode 100644
index 0000000..e9f1f34
--- /dev/null
+++ b/kernel/avx/kernel_dgemm_8x4_lib4.S
@@ -0,0 +1,13154 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
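+// stack arguments: after the PROLOGUE below subtracts STACKSIZE, the return address sits at
+// STACKSIZE(%rsp), so the 7th and following arguments start at STACKSIZE+8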
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
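+// stack arguments: the Win64 ABI places a 32-byte shadow space between the return address and
+// the stack arguments, so the 5th and following arguments start at STACKSIZE+40 after the
+// PROLOGUE below subtracts STACKSIZE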
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+4*k*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
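+// note: the current column of B is reused in place and in vshufpd/vperm2f128-permuted form,
+// so the accumulators hold the permuted element orderings documented above rather than
+// plain columns
+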
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nt_8x4_lib4, @function
+inner_kernel_dgemm_add_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nt_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_8x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+//	movq	%r11, %r15 // A1 <- A0
+//	addq	%r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+	// prefetch
+	vmovapd 0(%r11), %ymm8 // A0[0]
+//	vmovapd 0(%r15), %ymm9 // A1[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmovapd 0(%r13), %ymm12 // B[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmovapd 32(%r13), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+//	vmovapd 32(%r15), %ymm11 // A1[4]
+	vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	// unroll 1
+	vmovapd 64(%r13), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+//	vmovapd 64(%r15), %ymm9 // A1[8]
+	vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	// unroll 2
+	vmovapd 96(%r13), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+//	vmovapd 96(%r15), %ymm11 // A1[12]
+	vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r13
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	addq	$128, %r11
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	addq	$128, %r15 // not used by this routine (leftover from the commented-out A1-pointer variant)
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+
+	// unroll 3
+	vmovapd 0(%r13), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+//	vmovapd 0(%r15), %ymm9 // A1[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vmovapd 32(%r13), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+//	vmovapd 32(%r15), %ymm11 // A1[4]
+	vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	// unroll 1
+	vmovapd 64(%r13), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+//	vmovapd 64(%r15), %ymm9 // A1[8]
+	vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	// unroll 2
+	vmovapd 96(%r13), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+//	vmovapd 96(%r15), %ymm11 // A1[12]
+	vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r13
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+	addq	$128, %r11
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+//	addq	$128, %r15
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+
+	// unroll 3
+//	vmovapd 0(%r13), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+//	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+//	vmovapd 0(%r15), %ymm9 // A1[0]
+//	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+
+//	cmpl	$3, %r10d
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd 0(%r13), %ymm12 // B[0]
+	vmovapd 0(%r11), %ymm8 // A0[0]
+//	vmovapd 0(%r15), %ymm9 // A1[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	addq	$32, %r11
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	addq	$32, %r13
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+//	addq	$32, %r15
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	vshufpd $0x5, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	subl	$1, %r10d
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nt_8x4_lib4, .-inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+4*k*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
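+// same load/permutation scheme as the _add_ routine above, with vsubpd in place of vaddpd
+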
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nt_8x4_lib4, @function
+inner_kernel_dgemm_sub_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nt_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_8x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmovapd 0(%r13), %ymm12 // B[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmovapd 32(%r13), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+
+	vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+	// unroll 1
+	vmovapd 64(%r13), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+
+	vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+	// unroll 2
+	vmovapd 96(%r13), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+
+	vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r13
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+	addq	$128, %r11
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+
+	// unroll 3
+	vmovapd 0(%r13), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+	cmpl	$4, %r10d
+
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vmovapd 32(%r13), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+
+	vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+	// unroll 1
+	vmovapd 64(%r13), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+
+	vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+	// unroll 2
+	vmovapd 96(%r13), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+
+	vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r13
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+	addq	$128, %r11
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+
+	// unroll 3
+//	vmovapd 0(%r13), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+//	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+//	cmpl	$3, %r10d
+
+//	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd 0(%r13), %ymm12 // B[0]
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+	addq	$32, %r11
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	addq	$32, %r13
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+
+	vshufpd $0x5, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	subl	$1, %r10d
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+	cmpl	$0, %r10d
+
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nt_8x4_lib4, .-inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
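+// here each element of B is broadcast with vbroadcastsd, so the accumulators hold plain
+// columns as documented above; upcoming panels of B are software-prefetched in the main loop
+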
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nn_8x4_lib4, @function
+inner_kernel_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_8x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	0(%r13, %r14, 2) // software prefetch
+	prefetcht0	64(%r13, %r14, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vmovapd			32(%r11), %ymm10 // A0
+
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vmovapd			64(%r11), %ymm8 // A0
+
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	72(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	104(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	// unroll 2
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vmovapd			96(%r11), %ymm10 // A0
+
+	vbroadcastsd	48(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	80(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	112(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+	addq	$128, %r11
+
+	// unroll 3
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vmovapd			0(%r11), %ymm8 // A0
+
+	vbroadcastsd	56(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vmovapd			0(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	88(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	120(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	addq	%r14, %r13
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vmovapd			32(%r11), %ymm10 // A0
+
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vmovapd			64(%r11), %ymm8 // A0
+
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	72(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	104(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	// unroll 2
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vmovapd			96(%r11), %ymm10 // A0
+
+	vbroadcastsd	48(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	80(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	112(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+	addq	$128, %r11
+
+	// unroll 3
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+//	vmovapd			0(%r11), %ymm8 // A0
+
+	vbroadcastsd	56(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+//	vmovapd			0(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	88(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	120(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	addq	%r14, %r13
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd			0(%r11), %ymm8 // A0[0]
+	vmovapd 		0(%r11, %r12, 1), %ymm9 // A1[0]
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	addq	$32, %r11
+
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	subl	$1, %r10d
+
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+	addq	$8, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nn_8x4_lib4, .-inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- k
+// r11   <- A+4*sda*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
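+
+// in C-style pseudocode, the update accumulated by this routine is roughly
+// (a sketch assuming the lib4 panel-major layout, bs=4; index names illustrative):
+//
+//   for(kk=0; kk<k; kk++)
+//     for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<8; ii++)
+//         acc[ii][jj] -= A[ii][kk] * B[kk][jj];
+//
+// rows 0-3 of A come from the panel at r11, rows 4-7 from the panel at r11+r12;
+// B advances by one 4-row panel (r14 bytes) every 4 values of kk.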
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nn_8x4_lib4, @function
+inner_kernel_dgemm_sub_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_8x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	0(%r13, %r14, 2) // software prefetch
+	prefetcht0	64(%r13, %r14, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmovapd			32(%r11), %ymm10 // A0
+
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmovapd			64(%r11), %ymm8 // A0
+
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	72(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	104(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+
+	// unroll 2
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmovapd			96(%r11), %ymm10 // A0
+
+	vbroadcastsd	48(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	80(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	112(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	addq	$128, %r11
+
+	// unroll 3
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmovapd			0(%r11), %ymm8 // A0
+
+	vbroadcastsd	56(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmovapd			0(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	88(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	120(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	addq	%r14, %r13
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmovapd			32(%r11), %ymm10 // A0
+
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmovapd			64(%r11), %ymm8 // A0
+
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	72(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	104(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+
+	// unroll 2
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmovapd			96(%r11), %ymm10 // A0
+
+	vbroadcastsd	48(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	80(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	112(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	addq	$128, %r11
+
+	// unroll 3
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+//	vmovapd			0(%r11), %ymm8 // A0
+
+	vbroadcastsd	56(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+//	vmovapd			0(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	88(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	120(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	addq	%r14, %r13
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd			0(%r11), %ymm8 // A0[0]
+	vmovapd 		0(%r11, %r12, 1), %ymm9 // A1[0]
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	addq	$32, %r11
+
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	subl	$1, %r10d
+
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	addq	$8, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nn_8x4_lib4, .-inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d04 d14 d24 d34]
+// ymm5  <- [d05 d15 d25 d35]
+// ymm6  <- [d06 d16 d26 d36]
+// ymm7  <- [d07 d17 d27 d37]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d04 d14 d24 d34]
+// ymm5  <- [d05 d15 d25 d35]
+// ymm6  <- [d06 d16 d26 d36]
+// ymm7  <- [d07 d17 d27 d37]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
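+
+// in C-style pseudocode, the update accumulated by this routine is roughly
+// (a sketch assuming the lib4 panel-major layout, bs=4; index names illustrative):
+//
+//   for(kk=0; kk<k; kk++)
+//     for(jj=0; jj<8; jj++)
+//       for(ii=0; ii<4; ii++)
+//         acc[ii][jj] += A[ii][kk] * B[kk][jj];
+//
+// the eight 4x1 accumulator columns live in ymm0-ymm7; B advances by one
+// 4-row panel (r13 bytes) every 4 values of kk.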
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NN_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nn_4x8_lib4, @function
+inner_kernel_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nn_4x8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x8_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 		0(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	0(%r12, %r13, 2) // software prefetch
+	prefetcht0	64(%r12, %r13, 2) // software prefetch
+	prefetcht0	128(%r12, %r13, 2) // software prefetch
+	prefetcht0	192(%r12, %r13, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	128(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	160(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	192(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	224(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	136(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	168(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	200(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	232(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+	addq	$128, %r11
+
+	// unroll 2
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	144(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	176(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	208(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	240(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+
+	// unroll 3
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	152(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	184(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	216(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	248(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+	addq	%r13, %r12
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	128(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	160(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	192(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	224(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	136(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	168(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	200(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	232(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+	addq	$128, %r11
+
+	// unroll 2
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	144(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	176(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	208(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	240(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+
+	// unroll 3
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	152(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	184(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	216(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	248(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+	addq	%r13, %r12
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	128(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	160(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	192(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	224(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+
+	addq	$32, %r11
+	addq	$8, %r12
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nn_4x8_lib4, .-inner_kernel_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- B
+// r12   <- C
+// r13   <- 32*sdc
+// ymm0  <- [a00 a10 a20 a30]
+// ymm1  <- [a01 a11 a21 a31]
+// ymm2  <- [a02 a12 a22 a32]
+// ymm3  <- [a03 a13 a23 a33]
+// ymm4  <- [a40 a50 a60 a70]
+// ymm5  <- [a41 a51 a61 a71]
+// ymm6  <- [a42 a52 a62 a72]
+// ymm7  <- [a43 a53 a63 a73]
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- ?
+// r12   <- ?
+// r13   <- 32*sdc
+// ymm0  <- [a00 a10 a20 a30]
+// ymm1  <- [a01 a11 a21 a31]
+// ymm2  <- [a02 a12 a22 a32]
+// ymm3  <- [a03 a13 a23 a33]
+// ymm4  <- [a40 a50 a60 a70]
+// ymm5  <- [a41 a51 a61 a71]
+// ymm6  <- [a42 a52 a62 a72]
+// ymm7  <- [a43 a53 a63 a73]
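+
+// in C-style pseudocode, the update performed by this routine is roughly
+// (a sketch; here r10d counts the columns being updated, index names illustrative):
+//
+//   for(jj=0; jj<k; jj++)
+//     for(ll=0; ll<4; ll++)
+//       for(ii=0; ii<8; ii++)
+//         C[ii][jj] += A[ii][ll] * B[ll][jj];
+//
+// the 8x4 block of A is held in ymm0-ymm7; rows 0-3 of C sit at r12 and
+// rows 4-7 at r12+r13, and each column of C is loaded, updated and stored back.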
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEBP_ADD_NN_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgebp_add_nn_8x4_lib4, @function
+inner_kernel_dgebp_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgebp_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_8x4_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r12), %ymm12
+	vmovapd			0(%r12, %r13, 1), %ymm14
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	8(%r11), %ymm13
+	subl	$4, %r10d
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm6, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm7, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vmovapd			%ymm12, 0(%r12)
+	vmovapd			%ymm14, 0(%r12, %r13, 1)
+
+	vmovapd			32(%r12), %ymm12
+	vmovapd			32(%r12, %r13, 1), %ymm14
+	vbroadcastsd	32(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	40(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	48(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm6, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	56(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm7, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vmovapd			%ymm12, 32(%r12)
+	vmovapd			%ymm14, 32(%r12, %r13, 1)
+
+	vmovapd			64(%r12), %ymm12
+	vmovapd			64(%r12, %r13, 1), %ymm14
+	vbroadcastsd	64(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	72(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	80(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm6, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	88(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm7, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vmovapd			%ymm12, 64(%r12)
+	vmovapd			%ymm14, 64(%r12, %r13, 1)
+
+	vmovapd			96(%r12), %ymm12
+	vmovapd			96(%r12, %r13, 1), %ymm14
+	vbroadcastsd	96(%r11), %ymm13
+	addq	$128, %r11
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	-24(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	-16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm6, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	-8(%r11), %ymm13
+	addq	$128, %r12
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm7, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vmovapd			%ymm12, -32(%r12)
+	vmovapd			%ymm14, -32(%r12, %r13, 1)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r12), %ymm12
+	vmovapd			0(%r12, %r13, 1), %ymm14
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm6, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm7, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vmovapd			%ymm12, 0(%r12)
+	vmovapd			%ymm14, 0(%r12, %r13, 1)
+
+	addq	$32, %r11
+	addq	$32, %r12
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // main loop
+
+	// return
+0:
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgebp_add_nn_8x4_lib4, .-inner_kernel_dgebp_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B-offB+bs*sdb*sizeof(double)
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
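+
+// in C-style pseudocode, this edge consumes the part of B left in its current
+// 4-row panel (a sketch; index names illustrative):
+//
+//   kend = k<(4-offB) ? k : 4-offB;
+//   for(kk=0; kk<kend; kk++)
+//     for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<8; ii++)
+//         acc[ii][jj] += A[ii][kk] * B[offB+kk][jj];
+//
+// if some k remains afterwards, B is advanced to the start of the next panel.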
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgemm_add_nn_8x4_lib4, @function
+inner_edge_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgemm_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_8x4_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r15d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$4, %ebx
+	subl			%r15d, %ebx // 4-offsetB
+	cmpl			%r10d, %ebx
+//	jle				0f
+//	movl			%r10d, %ebx // kend=min(k,4-offsetB)
+//0:
+	cmovgl			%r10d, %ebx // kend=min(k,4-offsetB)
+
+	movl			%r15d, %eax
+	sall			$3, %eax // offsetB*sizeof(double)
+	addq			%rax, %r13 // B+offsetB*sizeof(double)
+
+1:
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	subl			$1, %ebx // kend-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %ebx
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r14, %r13
+	subq			$32, %r13 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgemm_add_nn_8x4_lib4, .-inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d04 d14 d24 d34]
+// ymm5  <- [d05 d15 d25 d35]
+// ymm6  <- [d06 d16 d26 d36]
+// ymm7  <- [d07 d17 d27 d37]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d04 d14 d24 d34]
+// ymm5  <- [d05 d15 d25 d35]
+// ymm6  <- [d06 d16 d26 d36]
+// ymm7  <- [d07 d17 d27 d37]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm15 <- dirty
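+
+// same idea as the 8x4 edge above, for the 4x8 accumulator (a sketch;
+// index names illustrative):
+//
+//   kend = k<(4-offB) ? k : 4-offB;
+//   for(kk=0; kk<kend; kk++)
+//     for(jj=0; jj<8; jj++)
+//       for(ii=0; ii<4; ii++)
+//         acc[ii][jj] += A[ii][kk] * B[offB+kk][jj];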
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGEMM_ADD_NN_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgemm_add_nn_4x8_lib4, @function
+inner_edge_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgemm_add_nn_4x8_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x8_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r14d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$4, %r15d
+	subl			%r14d, %r15d // 4-offsetB
+	cmpl			%r10d, %r15d
+//	jle				0f
+//	movl			%r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+	cmovgl			%r10d, %r15d // kend=min(k,4-offsetB)
+
+	movl			%r14d, %eax
+	sall			$3, %eax // offsetB*sizeof(double)
+	addq			%rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+	vmovapd			0(%r11), %ymm12
+	vbroadcastsd	0(%r12), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	32(%r12), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	64(%r12), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	96(%r12), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	128(%r12), %ymm13 // B
+	vmulpd			%ymm12, %ymm13, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	160(%r12), %ymm13 // B
+	vmulpd			%ymm12, %ymm13, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	192(%r12), %ymm13 // B
+	vmulpd			%ymm12, %ymm13, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	224(%r12), %ymm13 // B
+	vmulpd			%ymm12, %ymm13, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+
+	subl			$1, %r10d // k-1
+	subl			$1, %r15d // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r15d
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r13, %r12
+	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgemm_add_nn_4x8_lib4, .-inner_edge_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10   <- A
+// r11   <- 4*sda*sizeof(double)
+// r12   <- B
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- A+4*4*sizeof(double)
+// r11   <- 4*sda*sizeof(double)
+// r12   <- B+4*4*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
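+
+// in C-style pseudocode, this edge handles the 4x4 triangular corner of the
+// upper-triangular B, accessed transposed ('nt') (a sketch; index names illustrative):
+//
+//   for(kk=0; kk<4; kk++)
+//     for(jj=0; jj<=kk; jj++)
+//       for(ii=0; ii<8; ii++)
+//         acc[ii][jj] += A[ii][kk] * B[jj][kk];
+//
+// A and B are then advanced past these 4 columns/rows (128 bytes each).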
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_8x4_lib4, @function
+inner_edge_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_8x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r12), %ymm12
+	vmovapd			0(%r10), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			0(%r10, %r11, 1), %ymm9
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+
+	vbroadcastsd	32(%r12), %ymm12
+	vmovapd			32(%r10), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			32(%r10, %r11, 1), %ymm9
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	40(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+
+	vbroadcastsd	64(%r12), %ymm12
+	vmovapd			64(%r10), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			64(%r10, %r11, 1), %ymm9
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	72(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	80(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+
+	vbroadcastsd	96(%r12), %ymm12
+	vmovapd			96(%r10), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			96(%r10, %r11, 1), %ymm9
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	104(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	112(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	120(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+
+	addq			$128, %r10
+	addq			$128, %r12
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_8x4_lib4, .-inner_edge_dtrmm_nt_ru_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- max(k-4,0)
+// r11   <- A+4*4*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+4*4*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
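+
+// same triangular update as the edge above, but k is checked after every
+// column so the routine can stop early when k<4 (a sketch; index names illustrative):
+//
+//   for(kk=0; kk<4 && kk<k; kk++)
+//     for(jj=0; jj<=kk; jj++)
+//       for(ii=0; ii<8; ii++)
+//         acc[ii][jj] += A[ii][kk] * B[jj][kk];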
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_8x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r13), %ymm12
+	subl			$1, %r10d
+	vmovapd			0(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	addq			$32, %r11
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	addq			$32, %r13
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vbroadcastsd	0(%r13), %ymm12
+	subl			$1, %r10d
+	vmovapd			0(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	addq			$32, %r11
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	addq			$32, %r13
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vbroadcastsd	0(%r13), %ymm12
+	subl			$1, %r10d
+	vmovapd			0(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	addq			$32, %r11
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	addq			$32, %r13
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vbroadcastsd	0(%r13), %ymm12
+	subl			$1, %r10d
+	vmovapd			0(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	addq			$32, %r11
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	addq			$32, %r13
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_8x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A0
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B-offB+bs*sdb*sizeof(double)
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
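+
+// in C-style pseudocode, for offB==0 this edge handles the 4x4 lower-triangular
+// corner of B ('nn') (a sketch; index names illustrative):
+//
+//   for(kk=0; kk<4; kk++)
+//     for(jj=0; jj<=kk; jj++)
+//       for(ii=0; ii<8; ii++)
+//         acc[ii][jj] += A[ii][kk] * B[kk][jj];
+//
+// for offB==1..3 the same triangle starts offB rows into B's panel, so the
+// edge consumes a different number of columns before handing over to the
+// aligned kernel.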
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NN_RL_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nn_rl_8x4_lib4, @function
+inner_edge_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nn_rl_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_8x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r15d
+	jg		0f
+
+	// offB==0
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r12, 1), %ymm9
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r12, 1), %ymm9
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	48(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	80(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r12, 1), %ymm9
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	56(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	88(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	120(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A0+4*bs*sizeof(double)
+	addq			%r14, %r13 // B+bs*sdb*sizeof(double)
+
+	jmp		3f
+
+0:
+	cmpl	$1, %r15d
+	jg		1f
+
+	// offB==1
+
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r12, 1), %ymm9
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r12, 1), %ymm9
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	48(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	80(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	subl			$3, %r10d // k-3
+	addq			$96, %r11 // A0+3*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$8, %r13 // B+bs*sdb*sizeof(double)-1
+
+	jmp		3f
+
+1:
+	cmpl	$2, %r15d
+	jg		2f
+
+	// offB==2
+
+	addq			$16, %r13 // B+2*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r12, 1), %ymm9
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+
+	subl			$2, %r10d // k-2
+	addq			$64, %r11 // A0+2*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$16, %r13 // B+bs*sdb*sizeof(double)-2
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r12, 1), %ymm9
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	72(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	104(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r12, 1), %ymm9
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	48(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	80(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	112(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r12, 1), %ymm9
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	56(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	88(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	120(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A0+4*bs*sizeof(double)
+	addq			%r14, %r13 // B+bs*sdb*sizeof(double)
+
+	jmp		3f
+
+2:
+	// offB==3
+
+	addq			$24, %r13 // B+3*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-3
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r12, 1), %ymm9
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	72(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r12, 1), %ymm9
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	48(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	80(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	112(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r12, 1), %ymm9
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	56(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	88(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	120(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A0+4*bs*sizeof(double)
+	addq			%r14, %r13 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nn_rl_8x4_lib4, .-inner_edge_dtrmm_nn_rl_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A0
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B-offB+bs*sdb*sizeof(double)
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
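+
+// same per-offset triangular update as the edge above, but k is checked after
+// every single column so the routine can stop early for short k (a sketch of
+// the offB==0 case; index names illustrative):
+//
+//   for(kk=0; kk<4 && kk<k; kk++)
+//     for(jj=0; jj<=kk; jj++)
+//       for(ii=0; ii<8; ii++)
+//         acc[ii][jj] += A[ii][kk] * B[kk][jj];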
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NN_RL_8X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nn_rl_8x4_gen_lib4, @function
+inner_edge_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nn_rl_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_8x4_gen_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	cmpl			$0, %r15d
+	jg				0f // offB>0
+
+	// offB==0
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f // end
+
+0:
+	cmpl			$1, %r15d
+	jg				1f // offB>1
+
+	// offB==1
+
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f // end
+
+1:
+	cmpl			$2, %r15d
+	jg				2f // offB>2
+
+	// offB==2
+
+	addq			$16, %r13 // B+2*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f
+
+2:
+	// offB==3
+
+	addq			$24, %r13 // B+3*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nn_rl_8x4_gen_lib4, .-inner_edge_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
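+//
+// The dgemm 8x4 main loop accumulates with the columns rotated per lane (see
+// the register map above). In C terms the de-interleave performed by the
+// vblendpd sequence below is (acc, D and p[] are illustrative names;
+// acc[aa][ii] is lane ii of accumulator aa and holds d_{ii, ii^p[aa]}):
+//
+//	static const int p[4] = {0, 1, 3, 2};
+//	for(aa=0; aa<4; aa++)
+//		for(ii=0; ii<4; ii++)
+//			D[ii^p[aa]][ii] = acc[aa][ii];   // D[jj][ii] = element (ii,jj), plain columns
+//
+// The lower 4x4 block (ymm4..ymm7) is de-interleaved with the same pattern.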
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_8x4_lib4, @function
+inner_blend_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_8x4_lib4:
+#endif
+#endif
+	
+
+	// tc==n
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vblendpd	$0xa, %ymm5, %ymm4, %ymm8
+	vblendpd	$0x5, %ymm5, %ymm4, %ymm9
+	vblendpd	$0xa, %ymm7, %ymm6, %ymm10
+	vblendpd	$0x5, %ymm7, %ymm6, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm4
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm6
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm5
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_8x4_lib4, .-inner_blend_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_8x4_lib4, @function
+inner_scale_11_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_8x4_lib4:
+#endif
+#endif
+	
+
+	vmovapd		0(%r10), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+	vmovapd		0(%r10, %r11, 1), %ymm15
+	vaddpd		%ymm4, %ymm15, %ymm4
+	vmovapd		32(%r10, %r11, 1), %ymm15
+	vaddpd		%ymm5, %ymm15, %ymm5
+	vmovapd		64(%r10, %r11, 1), %ymm15
+	vaddpd		%ymm6, %ymm15, %ymm6
+	vmovapd		96(%r10, %r11, 1), %ymm15
+	vaddpd		%ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_8x4_lib4, .-inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10   <- alpha
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
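+//
+// Net effect in C terms (D0/D1 are illustrative names for the two 4x4 blocks
+// held in ymm0..ymm3 and ymm4..ymm7): every accumulator entry is multiplied
+// by alpha and no C term is read:
+//
+//	for(jj=0; jj<4; jj++)
+//		for(ii=0; ii<4; ii++)
+//			{
+//			D0[jj][ii] *= alpha;   // rows 0..3
+//			D1[jj][ii] *= alpha;   // rows 4..7
+//			}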
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_A0_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_a0_8x4_lib4, @function
+inner_scale_a0_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_a0_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_8x4_lib4:
+#endif
+#endif
+	
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_a0_8x4_lib4, .-inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
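+//
+// Net effect in C terms, assuming bs=4 and the illustrative names D0/D1 for
+// the two 4x4 blocks in ymm0..ymm3 / ymm4..ymm7 (note that the C panels are
+// not read at all when beta==0.0, which is what the vucomisd/je shortcut
+// below implements):
+//
+//	for(jj=0; jj<4; jj++)
+//		for(ii=0; ii<4; ii++)
+//			{
+//			D0[jj][ii] *= alpha;
+//			D1[jj][ii] *= alpha;
+//			if(beta != 0.0)
+//				{
+//				D0[jj][ii] += beta * C[jj*bs+ii];            // first row panel of C
+//				D1[jj][ii] += beta * C[bs*sdc + jj*bs+ii];   // second row panel of C
+//				}
+//			}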
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x4_lib4, @function
+inner_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_lib4:
+#endif
+#endif
+	
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	// alg==1
+	vmovapd		0(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+	vmovapd		0(%r12, %r13, 1), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm4, %ymm15, %ymm4
+	vmovapd		32(%r12, %r13, 1), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm5, %ymm15, %ymm5
+	vmovapd		64(%r12, %r13, 1), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm6, %ymm15, %ymm6
+	vmovapd		96(%r12, %r13, 1), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- offset
+// r13   <- C
+// r14   <- 4*sdc*sizeof(double)
+// r15   <- n0 // col index: start from (inc)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- offset
+// r13   <- C
+// r14   <- 4*sdc*sizeof(double)
+// r15   <- n0 // col index: start from (inc)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
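+//
+// Compared to the aligned version above, the only difference is that C starts
+// "offset" rows (0..3) down in its row panel, so each 4-long column of C
+// straddles two consecutive panels. The vblendpd/vperm2f128/vshufpd sequences
+// below assemble such a column; in C terms (illustrative names, bs=4, m =
+// offset, cc[] the gathered column used in the beta update of column jj):
+//
+//	for(ii=0; ii<4; ii++)
+//		{
+//		rr = m + ii;                                   // row within the panel pair
+//		cc[ii] = C[(rr/bs)*bs*sdc + jj*bs + rr%bs];
+//		}
+//
+// The column feeding ymm4..ymm7 is gathered the same way, one panel further
+// down (the (%r13,%r14,1) and (%r13,%r14,2) addresses).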
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x4_gen_lib4, @function
+inner_scale_ab_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_gen_lib4:
+#endif
+#endif
+	
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+
+	vxorpd		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovapd		0(%r13), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	vmovapd		32(%r13), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm1, %ymm14, %ymm1
+	vmovapd		64(%r13), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm2, %ymm14, %ymm2
+	vmovapd		96(%r13), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm3, %ymm14, %ymm3
+
+	vmovapd		0(%r13, %r14, 1), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm4, %ymm14, %ymm4
+	vmovapd		32(%r13, %r14, 1), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm5, %ymm14, %ymm5
+	vmovapd		64(%r13, %r14, 1), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm6, %ymm14, %ymm6
+	vmovapd		96(%r13, %r14, 1), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm7, %ymm14, %ymm7
+
+	jmp		3f
+
+0:
+
+	cmpl	$1, %r12d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r13, %r14, 1), %ymm13
+	vmovapd		0(%r13, %r14, 2), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm4, %ymm13, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%r13, %r14, 1), %ymm13
+	vmovapd		32(%r13, %r14, 2), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm1, %ymm12, %ymm1
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm5, %ymm13, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r13, %r14, 1), %ymm13
+	vmovapd		64(%r13, %r14, 2), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm6, %ymm13, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%r13, %r14, 1), %ymm13
+	vmovapd		96(%r13, %r14, 2), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm3, %ymm12, %ymm3
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm7, %ymm13, %ymm7
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r12d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r13, %r14, 1), %ymm13
+	vmovapd		0(%r13, %r14, 2), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm4, %ymm13, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%r13, %r14, 1), %ymm13
+	vmovapd		32(%r13, %r14, 2), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm1, %ymm12, %ymm1
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm5, %ymm13, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r13, %r14, 1), %ymm13
+	vmovapd		64(%r13, %r14, 2), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm6, %ymm13, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%r13, %r14, 1), %ymm13
+	vmovapd		96(%r13, %r14, 2), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm3, %ymm12, %ymm3
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm7, %ymm13, %ymm7
+
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r13, %r14, 1), %ymm13
+	vmovapd		0(%r13, %r14, 2), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm4, %ymm13, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%r13, %r14, 1), %ymm13
+	vmovapd		32(%r13, %r14, 2), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm1, %ymm12, %ymm1
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm5, %ymm13, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r13, %r14, 1), %ymm13
+	vmovapd		64(%r13, %r14, 2), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm6, %ymm13, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%r13, %r14, 1), %ymm13
+	vmovapd		96(%r13, %r14, 2), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm3, %ymm12, %ymm3
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm7, %ymm13, %ymm7
+
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x4_gen_lib4, .-inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x4_lib4, @function
+inner_blend_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_lib4:
+#endif
+#endif
+	
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	// tc==n
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vblendpd	$0xa, %ymm5, %ymm4, %ymm8
+	vblendpd	$0x5, %ymm5, %ymm4, %ymm9
+	vblendpd	$0xa, %ymm7, %ymm6, %ymm10
+	vblendpd	$0x5, %ymm7, %ymm6, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm4
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm6
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm5
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm7
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	// alg==1
+	vmovapd		0(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+	vmovapd		0(%r12, %r13, 1), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm4, %ymm15, %ymm4
+	vmovapd		32(%r12, %r13, 1), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm5, %ymm15, %ymm5
+	vmovapd		64(%r12, %r13, 1), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm6, %ymm15, %ymm6
+	vmovapd		96(%r12, %r13, 1), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x4_lib4, .-inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_4x8_lib4, @function
+inner_scale_ab_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_4x8_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x8_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm0, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm1, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm2, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm3, %ymm3
+	vmovapd		128(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm4, %ymm4
+	vmovapd		160(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm5, %ymm5
+	vmovapd		192(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm6, %ymm6
+	vmovapd		224(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm7, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_4x8_lib4, .-inner_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- &alpha
+// r11   <- &beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- &alpha
+// r11   <- &beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
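+//
+// Net effect in C terms: the 8x4 sub-result acc computed by the caller is
+// transposed into a 4x8 block, scaled by alpha, and beta*C is added, with C
+// (and the result) laid out as 8 consecutive bs-long columns of a single row
+// panel (acc, D and bs=4 are illustrative names; as above, C is not read when
+// beta==0.0):
+//
+//	for(jj=0; jj<8; jj++)
+//		for(ii=0; ii<4; ii++)
+//			{
+//			d = alpha * acc[jj][ii];       // acc[jj][ii] = element (jj,ii) of the 8x4 block
+//			if(beta != 0.0)
+//				d += beta * C[jj*bs+ii];
+//			D[jj*bs+ii] = d;               // left in ymm0..ymm7 for the caller to store
+//			}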
+
+#if MACRO_LEVEL>=1
+	.macro INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_tran_scale_ab_4x8_lib4, @function
+inner_tran_scale_ab_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_tran_scale_ab_4x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_lib4:
+#endif
+#endif
+		
+	vunpcklpd	%ymm1, %ymm0, %ymm12
+	vunpckhpd	%ymm0, %ymm1, %ymm13
+	vunpcklpd	%ymm3, %ymm2, %ymm14
+	vunpckhpd	%ymm2, %ymm3, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm0
+	vperm2f128	$0x31, %ymm12, %ymm14, %ymm2
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x31, %ymm13, %ymm15, %ymm3
+
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vunpcklpd	%ymm5, %ymm4, %ymm12
+	vunpckhpd	%ymm4, %ymm5, %ymm13
+	vunpcklpd	%ymm7, %ymm6, %ymm14
+	vunpckhpd	%ymm6, %ymm7, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm4
+	vperm2f128	$0x31, %ymm12, %ymm14, %ymm6
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm5
+	vperm2f128	$0x31, %ymm13, %ymm15, %ymm7
+
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	vbroadcastsd 0(%r11), %ymm14 // beta
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm0, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm1, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm2, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm3, %ymm3
+	vmovapd		128(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm4, %ymm4
+	vmovapd		160(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm5, %ymm5
+	vmovapd		192(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm6, %ymm6
+	vmovapd		224(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm7, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_tran_scale_ab_4x8_lib4, .-inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- offset
+// r13   <- C
+// r14   <- 4*sdc*sizeof(double)
+// r15   <- n0 // col index: start from (inc)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- offset
+// r13   <- C
+// r14   <- 4*sdc*sizeof(double)
+// r15   <- n0 // col index: start from (inc)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x4_gen_lib4, @function
+inner_blend_scale_ab_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_gen_lib4:
+#endif
+#endif
+	
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	// tc==n
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vblendpd	$0xa, %ymm5, %ymm4, %ymm8
+	vblendpd	$0x5, %ymm5, %ymm4, %ymm9
+	vblendpd	$0xa, %ymm7, %ymm6, %ymm10
+	vblendpd	$0x5, %ymm7, %ymm6, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm4
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm6
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm5
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm7
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+
+	vxorpd		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovapd		0(%r13), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	vmovapd		32(%r13), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm1, %ymm14, %ymm1
+	vmovapd		64(%r13), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm2, %ymm14, %ymm2
+	vmovapd		96(%r13), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm3, %ymm14, %ymm3
+
+	vmovapd		0(%r13, %r14, 1), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm4, %ymm14, %ymm4
+	vmovapd		32(%r13, %r14, 1), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm5, %ymm14, %ymm5
+	vmovapd		64(%r13, %r14, 1), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm6, %ymm14, %ymm6
+	vmovapd		96(%r13, %r14, 1), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm7, %ymm14, %ymm7
+
+	jmp		3f
+
+0:
+
+	cmpl	$1, %r12d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r13, %r14, 1), %ymm13
+	vmovapd		0(%r13, %r14, 2), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm4, %ymm13, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%r13, %r14, 1), %ymm13
+	vmovapd		32(%r13, %r14, 2), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm1, %ymm12, %ymm1
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm5, %ymm13, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r13, %r14, 1), %ymm13
+	vmovapd		64(%r13, %r14, 2), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm6, %ymm13, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%r13, %r14, 1), %ymm13
+	vmovapd		96(%r13, %r14, 2), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm3, %ymm12, %ymm3
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm7, %ymm13, %ymm7
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r12d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r13, %r14, 1), %ymm13
+	vmovapd		0(%r13, %r14, 2), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm4, %ymm13, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%r13, %r14, 1), %ymm13
+	vmovapd		32(%r13, %r14, 2), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm1, %ymm12, %ymm1
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm5, %ymm13, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r13, %r14, 1), %ymm13
+	vmovapd		64(%r13, %r14, 2), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm6, %ymm13, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%r13, %r14, 1), %ymm13
+	vmovapd		96(%r13, %r14, 2), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm3, %ymm12, %ymm3
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm7, %ymm13, %ymm7
+
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r13, %r14, 1), %ymm13
+	vmovapd		0(%r13, %r14, 2), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm4, %ymm13, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%r13, %r14, 1), %ymm13
+	vmovapd		32(%r13, %r14, 2), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm1, %ymm12, %ymm1
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm5, %ymm13, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r13, %r14, 1), %ymm13
+	vmovapd		64(%r13, %r14, 2), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm6, %ymm13, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%r13, %r14, 1), %ymm13
+	vmovapd		96(%r13, %r14, 2), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm3, %ymm12, %ymm3
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm7, %ymm13, %ymm7
+
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x4_gen_lib4, .-inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_8x4_lib4, @function
+inner_blend_scale_11_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_lib4:
+#endif
+#endif
+	
+
+	// tc==n
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vblendpd	$0xa, %ymm5, %ymm4, %ymm8
+	vblendpd	$0x5, %ymm5, %ymm4, %ymm9
+	vblendpd	$0xa, %ymm7, %ymm6, %ymm10
+	vblendpd	$0x5, %ymm7, %ymm6, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm4
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm6
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm5
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm7
+
+	// alg==1
+	vmovapd		0(%r10), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+	vmovapd		0(%r10, %r11, 1), %ymm15
+	vaddpd		%ymm4, %ymm15, %ymm4
+	vmovapd		32(%r10, %r11, 1), %ymm15
+	vaddpd		%ymm5, %ymm15, %ymm5
+	vmovapd		64(%r10, %r11, 1), %ymm15
+	vaddpd		%ymm6, %ymm15, %ymm6
+	vmovapd		96(%r10, %r11, 1), %ymm15
+	vaddpd		%ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_8x4_lib4, .-inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
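+//
+// Scalar reference of the same step: a right-looking Cholesky factorization
+// of the 4 columns of the 8x4 panel held in the accumulators, storing the
+// inverse pivots in inv_diag_E and substituting 0.0 whenever a pivot is not
+// strictly positive (the jbe / 1f..8f paths). Illustrative sketch only,
+// D[jj][ii] addressing column jj / row ii, sqrt() from <math.h>:
+//
+//	for(jj=0; jj<4; jj++)
+//		{
+//		dd = D[jj][jj];
+//		inv = dd>0.0 ? 1.0/sqrt(dd) : 0.0;
+//		inv_diag_E[jj] = inv;
+//		for(ii=0; ii<8; ii++)
+//			D[jj][ii] *= inv;                        // scale column jj
+//		for(ll=jj+1; ll<4; ll++)
+//			for(ii=0; ii<8; ii++)
+//				D[ll][ii] -= D[jj][ii] * D[jj][ll];  // trailing update (entries above the
+//				                                     // diagonal are never referenced)
+//		}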
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DPOTRF_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dpotrf_8x4_lib4, @function
+inner_edge_dpotrf_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dpotrf_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_8x4_lib4:
+#endif
+#endif
+	
+	vxorpd	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd	.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd	LC04(%rip), %xmm14 // 1.0
+#endif
+
+	vmovsd		%xmm0, %xmm0, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+2:
+	vmovsd		%xmm13, 0(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm0
+	vmulpd		%ymm4, %ymm13, %ymm4
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm1, %ymm1
+	vmulpd		%ymm4, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm5, %ymm5
+	vperm2f128	$0x11, %ymm0, %ymm0, %ymm11
+	vpermilpd	$0x0, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm2, %ymm2
+	vmulpd		%ymm4, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm6, %ymm6
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+	vmulpd		%ymm4, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm7, %ymm7
+
+
+	vpermilpd	$0x3, %xmm1, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+4:
+	vmovsd		%xmm13, 8(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm1
+	vmulpd		%ymm5, %ymm13, %ymm5
+	vperm2f128	$0x11, %ymm1, %ymm1, %ymm11
+	vpermilpd	$0x0, %ymm11, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm2, %ymm2
+	vmulpd		%ymm5, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm6, %ymm6
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+	vmulpd		%ymm5, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm7, %ymm7
+
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+6:
+	vmovsd		%xmm13, 16(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm2
+	vmulpd		%ymm6, %ymm13, %ymm6
+	vperm2f128	$0x11, %ymm2, %ymm2, %ymm11
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+	vmulpd		%ymm6, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm7, %ymm7
+
+	vextractf128	$0x1, %ymm3, %xmm13
+	vpermilpd	$0x3, %xmm13, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+8:
+	vmovsd		%xmm13, 24(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm3, %ymm13, %ymm3
+	vmulpd		%ymm7, %ymm13, %ymm7
+
+	jmp				0f
+
+1:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dpotrf_8x4_lib4, .-inner_edge_dpotrf_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization vs
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dpotrf_8x4_vs_lib4, @function
+inner_edge_dpotrf_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dpotrf_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_8x4_vs_lib4:
+#endif
+#endif
+	
+	vxorpd	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd	.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd	LC04(%rip), %xmm14 // 1.0
+#endif
+
+	vmovsd		%xmm0, %xmm0, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+2:
+	vmovsd		%xmm13, 0(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm0
+	vmulpd		%ymm4, %ymm13, %ymm4
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm1, %ymm1
+	vmulpd		%ymm4, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm5, %ymm5
+	vperm2f128	$0x11, %ymm0, %ymm0, %ymm11
+	vpermilpd	$0x0, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm2, %ymm2
+	vmulpd		%ymm4, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm6, %ymm6
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+	vmulpd		%ymm4, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm7, %ymm7
+
+	cmpl		$2, %r11d
+	jl			0f // ret
+
+	vpermilpd	$0x3, %xmm1, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+4:
+	vmovsd		%xmm13, 8(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm1
+	vmulpd		%ymm5, %ymm13, %ymm5
+	vperm2f128	$0x11, %ymm1, %ymm1, %ymm11
+	vpermilpd	$0x0, %ymm11, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm2, %ymm2
+	vmulpd		%ymm5, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm6, %ymm6
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+	vmulpd		%ymm5, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm7, %ymm7
+
+	cmpl		$3, %r11d
+	jl			0f // ret
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+6:
+	vmovsd		%xmm13, 16(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm2
+	vmulpd		%ymm6, %ymm13, %ymm6
+	vperm2f128	$0x11, %ymm2, %ymm2, %ymm11
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+	vmulpd		%ymm6, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm7, %ymm7
+
+	cmpl		$4, %r11d
+	jl			0f // ret
+
+	vextractf128	$0x1, %ymm3, %xmm13
+	vpermilpd	$0x3, %xmm13, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+8:
+	vmovsd		%xmm13, 24(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm3, %ymm13, %ymm3
+	vmulpd		%ymm7, %ymm13, %ymm7
+
+	jmp				0f
+
+1:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dpotrf_8x4_vs_lib4, .-inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for Cholesky factorization
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
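+// Illustrative scalar sketch in C of the substitution below: a right solve with the
+// transpose of the lower-triangular 4x4 factor E, whose diagonal reciprocals are
+// precomputed in inv_diag_E (the 2-D indexing of D and E is an assumption):
+//
+//   for(int jj=0; jj<4; jj++)
+//       {
+//       for(int ii=0; ii<8; ii++)
+//           D[ii][jj] *= inv_diag_E[jj];               // divide by the diagonal entry
+//       for(int kk=jj+1; kk<4; kk++)
+//           for(int ii=0; ii<8; ii++)
+//               D[ii][kk] -= E[kk][jj] * D[ii][jj];    // eliminate column jj from column kk
+//       }
+//
+// In the vector code each ii-loop collapses into one multiply/subtract pair on the
+// upper half (ymm0-3, rows 0-3) and one on the lower half (ymm4-7, rows 4-7).
+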
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_8x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vbroadcastsd	8(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm1, %ymm1
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm5, %ymm5
+	vbroadcastsd	16(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm6, %ymm6
+	vbroadcastsd	24(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vbroadcastsd	48(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vmulpd			%ymm5, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm6, %ymm6
+	vbroadcastsd	56(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm5, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vbroadcastsd	88(%r10), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm6, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_8x4_lib4, .-inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for Cholesky factorization
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0 (note: the VEX 128-bit load below also clears the upper lanes)
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vbroadcastsd	8(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm1, %ymm1
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm5, %ymm5
+	vbroadcastsd	16(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm6, %ymm6
+	vbroadcastsd	24(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+
+	cmpl			$2, %r12d
+	jl				0f // ret
+
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vbroadcastsd	48(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vmulpd			%ymm5, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm6, %ymm6
+	vbroadcastsd	56(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm5, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+
+	cmpl			$3, %r12d
+	jl				0f // ret
+
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vbroadcastsd	88(%r10), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm6, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+
+	cmpl			$4, %r12d
+	jl				0f // ret
+
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_8x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
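+// Illustrative scalar sketch in C of the substitution below: the same right solve with
+// the transposed lower-triangular factor E, but with an implicit unit diagonal, so no
+// scaling is needed (the 2-D indexing of D and E is an assumption):
+//
+//   for(int kk=1; kk<4; kk++)
+//       for(int jj=0; jj<kk; jj++)
+//           for(int ii=0; ii<8; ii++)
+//               D[ii][kk] -= E[kk][jj] * D[ii][jj];
+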
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_ONE_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_one_8x4_lib4, @function
+inner_edge_dtrsm_rlt_one_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_one_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_8x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	8(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm1, %ymm1
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm5, %ymm5
+
+	vbroadcastsd	16(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm6, %ymm6
+	vbroadcastsd	48(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vmulpd			%ymm5, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm6, %ymm6
+
+	vbroadcastsd	24(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+	vbroadcastsd	56(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm5, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+	vbroadcastsd	88(%r10), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm6, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_one_8x4_lib4, .-inner_edge_dtrsm_rlt_one_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10  <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_ONE_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_one_8x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl			$2, %r11d
+
+	jl				0f // ret
+
+	vbroadcastsd	8(%r10), %ymm13
+	cmpl			$3, %r11d
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm1, %ymm1
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm5, %ymm5
+
+	jl				0f // ret
+
+	vbroadcastsd	16(%r10), %ymm13
+	cmpl			$4, %r11d
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm6, %ymm6
+	vbroadcastsd	48(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vmulpd			%ymm5, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm6, %ymm6
+
+	jl				0f // ret
+
+	vbroadcastsd	24(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+	vbroadcastsd	56(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm5, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+	vbroadcastsd	88(%r10), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm6, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_one_8x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
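+// Illustrative scalar sketch in C of the substitution below: a right solve with the
+// transpose of the upper-triangular 4x4 factor E, processed from the last column
+// backwards, with diagonal reciprocals in inv_diag_E (2-D indexing assumed):
+//
+//   for(int jj=3; jj>=0; jj--)
+//       {
+//       for(int ii=0; ii<8; ii++)
+//           D[ii][jj] *= inv_diag_E[jj];
+//       for(int kk=0; kk<jj; kk++)
+//           for(int ii=0; ii<8; ii++)
+//               D[ii][kk] -= E[kk][jj] * D[ii][jj];
+//       }
+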
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUT_INV_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rut_inv_8x4_lib4, @function
+inner_edge_dtrsm_rut_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rut_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_8x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vmulpd			%ymm7, %ymm12, %ymm7
+	vbroadcastsd	112(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm7, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	104(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm7, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	96(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm7, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vmulpd			%ymm6, %ymm12, %ymm6
+	vbroadcastsd	72(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm6, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm6, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vmulpd			%ymm5, %ymm12, %ymm5
+	vbroadcastsd	32(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm5, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+	vmulpd			%ymm4, %ymm12, %ymm4
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rut_inv_8x4_lib4, .-inner_edge_dtrsm_rut_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUT_INV_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rut_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rut_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl			$3, %r12d
+	jle				0f
+
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vmulpd			%ymm7, %ymm12, %ymm7
+	vbroadcastsd	112(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm7, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	104(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm7, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	96(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm7, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+
+0:
+	cmpl			$2, %r12d
+	jle				1f
+
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vmulpd			%ymm6, %ymm12, %ymm6
+	vbroadcastsd	72(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm6, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm6, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+
+1:
+	cmpl			$1, %r12d
+	jle				2f
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vmulpd			%ymm5, %ymm12, %ymm5
+	vbroadcastsd	32(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm5, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+
+2:
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+	vmulpd			%ymm4, %ymm12, %ymm4
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rut_inv_8x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
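+// Illustrative scalar sketch in C of the substitution below: a right solve with the
+// upper-triangular 4x4 factor E itself (not transposed), column by column, with
+// diagonal reciprocals in inv_diag_E (2-D indexing assumed):
+//
+//   for(int jj=0; jj<4; jj++)
+//       {
+//       for(int kk=0; kk<jj; kk++)
+//           for(int ii=0; ii<8; ii++)
+//               D[ii][jj] -= E[kk][jj] * D[ii][kk];
+//       for(int ii=0; ii<8; ii++)
+//           D[ii][jj] *= inv_diag_E[jj];
+//       }
+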
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_run_inv_8x4_lib4, @function
+inner_edge_dtrsm_run_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_run_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_8x4_lib4:
+#endif
+#endif
+
+	// first column
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+	vmulpd			%ymm4, %ymm12, %ymm4
+
+	// second column
+	vbroadcastsd	32(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm4, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vmulpd			%ymm5, %ymm12, %ymm5
+
+	// third column
+	vbroadcastsd	64(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm4, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	72(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm5, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vmulpd			%ymm6, %ymm12, %ymm6
+
+	// fourth column
+	vbroadcastsd	96(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm4, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vbroadcastsd	104(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm5, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vbroadcastsd	112(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm6, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vmulpd			%ymm7, %ymm12, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_run_inv_8x4_lib4, .-inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10  <- E0
+// r11  <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E0
+// r11  <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
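+// Illustrative scalar sketch in C of the substitution below: a forward solve with the
+// 8x8 unit-lower-triangular factor E applied from the left to the 8x4 block D
+// (2-D indexing assumed; E actually spans two 4-row panels, hence the r11 stride):
+//
+//   for(int kk=0; kk<8; kk++)
+//       for(int ii=kk+1; ii<8; ii++)
+//           for(int jj=0; jj<4; jj++)
+//               D[ii][jj] -= E[ii][kk] * D[kk][jj];
+//
+// The unit diagonal is never loaded: the diagonal entry (and anything above it within
+// the panel column) is zeroed out of each loaded column of E with vblendpd.
+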
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lln_one_8x4_lib4, @function
+inner_edge_dtrsm_lln_one_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lln_one_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_8x4_lib4:
+#endif
+#endif
+
+	// solve top-left
+	vxorpd		%ymm14, %ymm14, %ymm14
+
+	vmovapd		0(%r10), %ymm12
+	vxorpd		%ymm14, %ymm14, %ymm14
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd		0(%r10, %r11, 1), %ymm14
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm0, %ymm0
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm4, %ymm4
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm1, %ymm1
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm5, %ymm5
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm2, %ymm2
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm6, %ymm6
+	vperm2f128	$0x00, %ymm3, %ymm3, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm7, %ymm7
+
+	vmovapd		32(%r10), %ymm12
+	vxorpd		%ymm14, %ymm14, %ymm14
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r10, %r11, 1), %ymm14
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm0, %ymm0
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm4, %ymm4
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm1, %ymm1
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm5, %ymm5
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm2, %ymm2
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm6, %ymm6
+	vperm2f128	$0x00, %ymm3, %ymm3, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm7, %ymm7
+
+	vmovapd		64(%r10), %ymm12
+	vxorpd		%ymm14, %ymm14, %ymm14
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd		64(%r10, %r11, 1), %ymm14
+	vperm2f128	$0x11, %ymm0, %ymm0, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm0, %ymm0
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm4, %ymm4
+	vperm2f128	$0x11, %ymm1, %ymm1, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm1, %ymm1
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm5, %ymm5
+	vperm2f128	$0x11, %ymm2, %ymm2, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm2, %ymm2
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm6, %ymm6
+	vperm2f128	$0x11, %ymm3, %ymm3, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm7, %ymm7
+
+	vmovapd		96(%r10, %r11, 1), %ymm14
+	vperm2f128	$0x11, %ymm0, %ymm0, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm4, %ymm4
+	vperm2f128	$0x11, %ymm1, %ymm1, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm5, %ymm5
+	vperm2f128	$0x11, %ymm2, %ymm2, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm6, %ymm6
+	vperm2f128	$0x11, %ymm3, %ymm3, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm7, %ymm7
+
+	addq		$128, %r10
+
+
+	// solve bottom-right
+	vxorpd		%ymm14, %ymm14, %ymm14
+
+	vmovapd		0(%r10, %r11, 1), %ymm12
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x00, %ymm4, %ymm4, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm4, %ymm4
+	vperm2f128	$0x00, %ymm5, %ymm5, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm5, %ymm5
+	vperm2f128	$0x00, %ymm6, %ymm6, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm6, %ymm6
+	vperm2f128	$0x00, %ymm7, %ymm7, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm7, %ymm7
+
+	vmovapd		32(%r10, %r11, 1), %ymm12
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x00, %ymm4, %ymm4, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm4, %ymm4
+	vperm2f128	$0x00, %ymm5, %ymm5, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm5, %ymm5
+	vperm2f128	$0x00, %ymm6, %ymm6, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm6, %ymm6
+	vperm2f128	$0x00, %ymm7, %ymm7, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm7, %ymm7
+
+	vmovapd		64(%r10, %r11, 1), %ymm12
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x11, %ymm4, %ymm4, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm4, %ymm4
+	vperm2f128	$0x11, %ymm5, %ymm5, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm5, %ymm5
+	vperm2f128	$0x11, %ymm6, %ymm6, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm6, %ymm6
+	vperm2f128	$0x11, %ymm7, %ymm7, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm7, %ymm7
+
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lln_one_8x4_lib4, .-inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
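+// Illustrative scalar sketch in C of the substitution below: a backward solve with the
+// 8x8 upper-triangular factor E applied from the left, with diagonal reciprocals in
+// inv_diag_E (2-D indexing assumed; E spans two 4-row panels, hence the r11 stride):
+//
+//   for(int kk=7; kk>=0; kk--)
+//       {
+//       for(int jj=0; jj<4; jj++)
+//           D[kk][jj] *= inv_diag_E[kk];
+//       for(int ii=0; ii<kk; ii++)
+//           for(int jj=0; jj<4; jj++)
+//               D[ii][jj] -= E[ii][kk] * D[kk][jj];
+//       }
+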
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LUN_INV_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lun_inv_8x4_lib4, @function
+inner_edge_dtrsm_lun_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lun_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_8x4_lib4:
+#endif
+#endif
+	
+	// bottom-right
+
+	vmovapd			224(%r10, %r11, 1), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	56(%r12), %ymm12
+	vmovapd			224(%r10), %ymm11
+
+	vperm2f128		$0x11, %ymm4, %ymm4, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm4, %ymm4
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm5, %ymm5, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm5, %ymm5
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm6, %ymm6, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm6, %ymm6
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm7, %ymm7, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm7, %ymm7
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0 // ?????????????
+	vmovapd			192(%r10, %r11, 1), %xmm13
+	vbroadcastsd	48(%r12), %ymm12
+	vmovapd			192(%r10), %ymm11
+
+	vperm2f128		$0x11, %ymm4, %ymm4, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm4, %ymm4
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm5, %ymm5, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm5, %ymm5
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm6, %ymm6, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm6, %ymm6
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm7, %ymm7, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm7, %ymm7
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			160(%r10, %r11, 1), %xmm13
+	vbroadcastsd	40(%r12), %ymm12
+	vmovapd			160(%r10), %ymm11
+
+	vperm2f128		$0x00, %ymm4, %ymm4, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm4, %ymm4
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x00, %ymm5, %ymm5, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm5, %ymm5
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x00, %ymm6, %ymm6, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm6, %ymm6
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x00, %ymm7, %ymm7, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm7, %ymm7
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vbroadcastsd	32(%r12), %ymm12
+	vmovapd			128(%r10), %ymm11
+
+	vperm2f128		$0x00, %ymm4, %ymm4, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm4, %ymm4
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x00, %ymm5, %ymm5, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm5, %ymm5
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x00, %ymm6, %ymm6, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm6, %ymm6
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x00, %ymm7, %ymm7, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm7, %ymm7
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	// top-left
+
+	vmovapd			96(%r10), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	24(%r12), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovapd			64(%r10), %xmm13
+	vbroadcastsd	16(%r12), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			32(%r10), %xmm13
+	vbroadcastsd	8(%r12), %ymm12
+
+	vpermilpd		$0xf, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vpermilpd		$0xf, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vpermilpd		$0xf, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vpermilpd		$0xf, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vbroadcastsd	0(%r12), %ymm12
+
+	vmulpd			%ymm0, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm0, %ymm0
+
+	vmulpd			%ymm1, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm1, %ymm1
+
+	vmulpd			%ymm2, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm2, %ymm2
+
+	vmulpd			%ymm3, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lun_inv_8x4_lib4, .-inner_edge_dtrsm_lun_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// r13  <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// r13  <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LUN_INV_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lun_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lun_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#endif
+#endif
+	
+	// bottom-right
+
+	cmpl	$7, %r13d
+	jle		0f
+
+	vmovapd			224(%r10, %r11, 1), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	56(%r12), %ymm12
+	vmovapd			224(%r10), %ymm11
+
+	vperm2f128		$0x11, %ymm4, %ymm4, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm4, %ymm4
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm5, %ymm5, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm5, %ymm5
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm6, %ymm6, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm6, %ymm6
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm7, %ymm7, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm7, %ymm7
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+0:
+	cmpl	$6, %r13d
+	jle		1f
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0 (note: the VEX 128-bit load below also clears the upper lanes)
+	vmovapd			192(%r10, %r11, 1), %xmm13
+	vbroadcastsd	48(%r12), %ymm12
+	vmovapd			192(%r10), %ymm11
+
+	vperm2f128		$0x11, %ymm4, %ymm4, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm4, %ymm4
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm5, %ymm5, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm5, %ymm5
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm6, %ymm6, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm6, %ymm6
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm7, %ymm7, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm7, %ymm7
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+1:
+	cmpl	$5, %r13d
+	jle		2f
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			160(%r10, %r11, 1), %xmm13
+	vbroadcastsd	40(%r12), %ymm12
+	vmovapd			160(%r10), %ymm11
+
+	vperm2f128		$0x00, %ymm4, %ymm4, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm4, %ymm4
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x00, %ymm5, %ymm5, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm5, %ymm5
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x00, %ymm6, %ymm6, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm6, %ymm6
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x00, %ymm7, %ymm7, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm7, %ymm7
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+2:
+
+	vbroadcastsd	32(%r12), %ymm12
+	vmovapd			128(%r10), %ymm11
+
+	vperm2f128		$0x00, %ymm4, %ymm4, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm4, %ymm4
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x00, %ymm5, %ymm5, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm5, %ymm5
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x00, %ymm6, %ymm6, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm6, %ymm6
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x00, %ymm7, %ymm7, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm7, %ymm7
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	// top-left
+
+	vmovapd			96(%r10), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	24(%r12), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovapd			64(%r10), %xmm13
+	vbroadcastsd	16(%r12), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			32(%r10), %xmm13
+	vbroadcastsd	8(%r12), %ymm12
+
+	vpermilpd		$0xf, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vpermilpd		$0xf, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vpermilpd		$0xf, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vpermilpd		$0xf, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vbroadcastsd	0(%r12), %ymm12
+
+	vmulpd			%ymm0, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm0, %ymm0
+
+	vmulpd			%ymm1, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm1, %ymm1
+
+	vmulpd			%ymm2, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm2, %ymm2
+
+	vmulpd			%ymm3, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lun_inv_8x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// left kernel
+//
+// input arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
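+// Illustrative scalar sketch in C of the factorization below: a left-looking LU
+// factorization without pivoting of the 8x4 block, storing the reciprocals of the
+// pivots in inv_diag_E; the upper part of each column keeps the U entries and the
+// strictly-lower part the scaled L entries (2-D indexing assumed):
+//
+//   for(int jj=0; jj<4; jj++)
+//       {
+//       for(int kk=0; kk<jj; kk++)                    // update column jj with previous columns
+//           for(int ii=kk+1; ii<8; ii++)
+//               D[ii][jj] -= D[ii][kk] * D[kk][jj];
+//       double inv = 1.0 / D[jj][jj];                 // pivot taken as-is: no pivoting
+//       inv_diag_E[jj] = inv;
+//       for(int ii=jj+1; ii<8; ii++)
+//           D[ii][jj] *= inv;                         // L entries below the pivot
+//       }
+//
+// In the vector code the already-final U entries are preserved by blending them back
+// into each column after the full-register multiplies and subtractions.
+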
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgetrf_l_8x4_lib4, @function
+inner_edge_dgetrf_l_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgetrf_l_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_l_8x4_lib4:
+#endif
+#endif
+	
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd			LC04(%rip), %xmm14 // 1.0
+#endif
+//	vmovddup		%xmm14, %xmm14
+
+	// first column
+//	vblendpd		$0x1, %ymm0, %ymm12, %ymm12
+	vmovapd			%ymm0, %ymm12
+	vdivsd			%xmm0, %xmm14, %xmm13
+//	vpermpd			$0x00, %ymm13, %ymm13
+	vmovddup		%xmm13, %xmm13
+	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd			%xmm13, 0(%r10)
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vblendpd		$0x1, %ymm12, %ymm0, %ymm0
+
+	// second column
+//	vpermpd			$0x00, %ymm1, %ymm13
+	vmovddup		%xmm1, %xmm13
+	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vblendpd		$0x2, %ymm1, %ymm13, %ymm12
+
+	vpermilpd		$0x3, %xmm1, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+//	vpermpd			$0x00, %ymm13, %ymm13
+	vmovddup		%xmm13, %xmm13
+	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd			%xmm13, 8(%r10)
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vblendpd		$0x3, %ymm12, %ymm1, %ymm1
+
+	// third column
+//	vpermpd			$0x00, %ymm2, %ymm13
+	vmovddup		%xmm2, %xmm13
+	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vblendpd		$0x2, %ymm2, %ymm13, %ymm12
+
+//	vpermpd			$0x55, %ymm2, %ymm13
+	vperm2f128		$0x00, %ymm2, %ymm2, %ymm13
+	vpermilpd		$0xf, %ymm13, %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vblendpd		$0x4, %ymm2, %ymm12, %ymm12
+
+//	vpermpd			$0xaa, %ymm2, %ymm13
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm13
+	vpermilpd		$0x0, %ymm13, %ymm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+//	vpermpd			$0x00, %ymm13, %ymm13
+	vmovddup		%xmm13, %xmm13
+	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd			%xmm13, 16(%r10)
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vblendpd		$0x7, %ymm12, %ymm2, %ymm2
+
+	// fourth column
+//	vpermpd			$0x00, %ymm3, %ymm13
+	vmovddup		%xmm3, %xmm13
+	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vblendpd		$0x2, %ymm3, %ymm13, %ymm12
+
+//	vpermpd			$0x55, %ymm3, %ymm13
+	vperm2f128		$0x00, %ymm3, %ymm3, %ymm13
+	vpermilpd		$0xf, %ymm13, %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vblendpd		$0x4, %ymm3, %ymm12, %ymm12
+
+//	vpermpd			$0xaa, %ymm3, %ymm13
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm11
+	vpermilpd		$0x0, %ymm11, %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm6, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vblendpd		$0x8, %ymm3, %ymm12, %ymm12
+	
+//	vpermpd			$0xff, %ymm3, %ymm13
+//	vperm2f128		$0x11, %ymm3, %ymm3, %ymm11
+	vpermilpd		$0xf, %ymm11, %ymm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+//	vpermpd			$0x00, %ymm13, %ymm13
+	vmovddup		%xmm13, %xmm13
+	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd			%xmm13, 24(%r10)
+//	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+	vblendpd		$0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgetrf_l_8x4_lib4, .-inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_lib4, @function
+inner_store_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_lib4:
+#endif
+#endif
+	
+	vmovapd %ymm0,  0(%r10)
+	vmovapd %ymm1, 32(%r10)
+	vmovapd %ymm2, 64(%r10)
+	vmovapd %ymm3, 96(%r10)
+
+	vmovapd %ymm4,  0(%r10, %r11, 1)
+	vmovapd %ymm5, 32(%r10, %r11, 1)
+	vmovapd %ymm6, 64(%r10, %r11, 1)
+	vmovapd %ymm7, 96(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x8_lib4, @function
+inner_store_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x8_lib4; .scl 2; .type 32; .endef
+inner_store_4x8_lib4:
+#endif
+#endif
+	
+	vmovapd %ymm0,   0(%r10)
+	vmovapd %ymm1,  32(%r10)
+	vmovapd %ymm2,  64(%r10)
+	vmovapd %ymm3,  96(%r10)
+
+	vmovapd %ymm4, 128(%r10)
+	vmovapd %ymm5, 160(%r10)
+	vmovapd %ymm6, 192(%r10)
+	vmovapd %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_lib4, .-inner_store_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
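+// The variable-size store below writes the full top 4 rows and masks the bottom 4 rows
+// with vmaskmovpd.  A hedged C sketch of the mask construction with AVX intrinsics
+// (the .LC03 constant is not shown here and is assumed to hold the per-lane row
+// indices of the lower panel, e.g. {4.5, 5.5, 6.5, 7.5}; D_lower and d4 are
+// illustrative names for the second 4-row panel pointer and the contents of ymm4):
+//
+//   #include <immintrin.h>
+//   __m256d idx  = _mm256_set_pd(7.5, 6.5, 5.5, 4.5);  // assumed .LC03 contents
+//   __m256d vkm  = _mm256_set1_pd((double) km);
+//   __m256d mask = _mm256_sub_pd(idx, vkm);            // negative (sign bit set) where row < km
+//   _mm256_maskstore_pd(D_lower, _mm256_castpd_si256(mask), d4); // store only rows < km
+//
+// The same mask is reused for every masked column store of the lower panel.
+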
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_vs_lib4, @function
+inner_store_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC03(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	cmpl		$2, %r13d
+	vmovapd		%ymm0, 0(%r10)
+	vmaskmovpd	%ymm4, %ymm15,  0(%r10, %r11, 1)
+	jl			0f // end
+	cmpl		$3, %r13d
+	vmovapd		%ymm1, 32(%r10)
+	vmaskmovpd	%ymm5, %ymm15, 32(%r10, %r11, 1)
+	jl			0f // end
+	vmovapd		%ymm2, 64(%r10)
+	vmaskmovpd	%ymm6, %ymm15, 64(%r10, %r11, 1)
+	je			0f // end
+	vmovapd		%ymm3, 96(%r10)
+	vmaskmovpd	%ymm7, %ymm15, 96(%r10, %r11, 1)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_vs_lib4, .-inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10   <- D
+// r11d  <- km
+// r12d  <- kn
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10   <- D
+// r11d  <- km
+// r12d  <- kn
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X8_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x8_vs_lib4, @function
+inner_store_4x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x8_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x8_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	vmaskmovpd	%ymm0, %ymm15,   0(%r10)
+	vmaskmovpd	%ymm1, %ymm15,  32(%r10)
+	vmaskmovpd	%ymm2, %ymm15,  64(%r10)
+	vmaskmovpd	%ymm3, %ymm15,  96(%r10)
+
+	vmaskmovpd	%ymm4, %ymm15, 128(%r10)
+	cmpl		$6, %r12d
+	jl			0f // end
+	vmaskmovpd	%ymm5, %ymm15, 160(%r10)
+	cmpl		$7, %r12d
+	jl			0f // end
+	vmaskmovpd	%ymm6, %ymm15, 192(%r10)
+	je			0f // end
+	vmaskmovpd	%ymm7, %ymm15, 224(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_vs_lib4, .-inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
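+//
+// The "store lower" variants keep the strictly upper triangle of the leading 4x4 block:
+// each of columns 1..3 is first loaded from D and blended so that only entries on and
+// below the diagonal come from the accumulators. A hedged scalar reference (acc is the
+// 8x4 block held in ymm0..ymm7, sdd the panel stride):
+//
+//   for (int j = 0; j < 4; j++) {
+//       for (int i = j; i < 4; i++) D[i + j*4]             = acc[i][j];   // top panel: lower part only
+//       for (int i = 4; i < 8; i++) D[i - 4 + 4*sdd + j*4] = acc[i][j];   // bottom panel: full columns
+//   }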
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_lib4, @function
+inner_store_l_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_lib4:
+#endif
+#endif
+	
+	vmovapd		%ymm0, 0(%r10)
+	vmovapd		32(%r10), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm1, %ymm1	
+	vmovapd		%ymm1, 32(%r10)
+	vmovapd		64(%r10), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm2, %ymm2	
+	vmovapd		%ymm2, 64(%r10)
+	vmovapd		96(%r10), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm3, %ymm3	
+	vmovapd		%ymm3, 96(%r10)
+
+	vmovapd		%ymm4, 0(%r10, %r11, 1)
+	vmovapd		%ymm5, 32(%r10, %r11, 1)
+	vmovapd		%ymm6, 64(%r10, %r11, 1)
+	vmovapd		%ymm7, 96(%r10, %r11, 1)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_lib4, .-inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_vs_lib4, @function
+inner_store_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC03(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	cmpl		$2, %r13d
+	vmovapd		%ymm0, 0(%r10)
+	vmaskmovpd	%ymm4, %ymm15,  0(%r10, %r11, 1)
+	jl			0f // end
+	cmpl		$3, %r13d
+	vmovapd		32(%r10), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm1, %ymm1	
+	vmovapd		%ymm1, 32(%r10)
+	vmaskmovpd	%ymm5, %ymm15, 32(%r10, %r11, 1)
+	jl			0f // end
+	vmovapd		64(%r10), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm2, %ymm2	
+	vmovapd		%ymm2, 64(%r10)
+	vmaskmovpd	%ymm6, %ymm15, 64(%r10, %r11, 1)
+	je			0f // end
+	vmovapd		96(%r10), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm3, %ymm3	
+	vmovapd		%ymm3, 96(%r10)
+	vmaskmovpd	%ymm7, %ymm15, 96(%r10, %r11, 1)
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_vs_lib4, .-inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// rbp  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// rbp  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
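+//
+// A hedged scalar reference of the generalized store: entry (i,j) of the block is written
+// only for m0 <= i < m1 and n0 <= j < min(n1,4), and the whole block is shifted down by
+// offsetD rows inside the panel layout (which is why a third 4-row panel can be touched).
+// The mask constants .LC02..LC10 are defined elsewhere in this file, not in this hunk.
+// acc is the 8x4 block held in ymm0..ymm7:
+//
+//   for (int j = n0; j < (n1 < 4 ? n1 : 4); j++)
+//       for (int i = m0; i < (m1 < 8 ? m1 : 8); i++) {
+//           int ii = offsetD + i;                       // row position after the shift
+//           D[ii%4 + (ii/4)*4*sdd + j*4] = acc[i][j];
+//       }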
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_gen_lib4, @function
+inner_store_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_gen_lib4:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2sd	%r13d, %xmm14, %xmm14
+	vcvtsi2sd	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm12
+	vmovupd		.LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm12
+	vmovupd		LC03(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm12, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm13, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm6, %ymm5
+	vmovapd		%ymm3, %ymm2
+	vmovapd		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	cmpl		$2, %r15d
+	vmaskmovpd	%ymm0, %ymm14,  0(%r11)
+	vmaskmovpd	%ymm4, %ymm15,  0(%r11, %r12, 1)
+	jl			4f // end
+	cmpl		$3, %r15d
+	vmaskmovpd	%ymm1, %ymm14, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm15, 32(%r11, %r12, 1)
+	jl			4f // end
+	vmaskmovpd	%ymm2, %ymm14, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm15, 64(%r11, %r12, 1)
+	je			4f // end
+	vmaskmovpd	%ymm3, %ymm14, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm15, 96(%r11, %r12, 1)
+
+	jmp		4f
+
+0:
+	
+	cmpl	$1, %r10d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x03, %ymm4, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm0, %ymm12, %ymm0
+	vperm2f128	$0x03, %ymm13, %ymm4, %ymm12
+	vshufpd		$0x5, %ymm4, %ymm12, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x03, %ymm5, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm1, %ymm12, %ymm1
+	vperm2f128	$0x03, %ymm13, %ymm5, %ymm12
+	vshufpd		$0x5, %ymm5, %ymm12, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x03, %ymm6, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm2, %ymm12, %ymm2
+	vperm2f128	$0x03, %ymm13, %ymm6, %ymm12
+	vshufpd		$0x5, %ymm6, %ymm12, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x03, %ymm7, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm3, %ymm12, %ymm3
+	vperm2f128	$0x03, %ymm13, %ymm7, %ymm12
+	vshufpd		$0x5, %ymm7, %ymm12, %ymm7
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm15, %ymm12, %ymm15
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm12
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC08(%rip), %ymm12
+	vmovupd		.LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC08(%rip), %ymm12
+	vmovupd		LC05(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x1, %ymm14, %ymm15, %ymm14
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r10d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x03, %ymm4, %ymm0, %ymm0
+	vperm2f128	$0x03, %ymm13, %ymm4, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x03, %ymm5, %ymm1, %ymm1
+	vperm2f128	$0x03, %ymm13, %ymm5, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x03, %ymm6, %ymm2, %ymm2
+	vperm2f128	$0x03, %ymm13, %ymm6, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x03, %ymm7, %ymm3, %ymm3
+	vperm2f128	$0x03, %ymm13, %ymm7, %ymm7
+
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm14
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC09(%rip), %ymm12
+	vmovupd		.LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC09(%rip), %ymm12
+	vmovupd		LC06(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x3, %ymm14, %ymm15, %ymm14
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x21, %ymm0, %ymm4, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm4, %ymm0
+	vperm2f128	$0x21, %ymm4, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x21, %ymm1, %ymm5, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm5, %ymm1
+	vperm2f128	$0x21, %ymm5, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x21, %ymm2, %ymm6, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm6, %ymm2
+	vperm2f128	$0x21, %ymm6, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x21, %ymm3, %ymm7, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm7, %ymm3
+	vperm2f128	$0x21, %ymm7, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm7
+
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm14
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC10(%rip), %ymm12
+	vmovupd		.LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC10(%rip), %ymm12
+	vmovupd		LC07(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x7, %ymm14, %ymm15, %ymm14
+
+3:
+
+	cmpl		$2, %r15d
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm4, %ymm14, 0(%r11, %r12, 1)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 2)
+	jl			4f // end
+	cmpl		$3, %r15d
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm14, 32(%r11, %r12, 1)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 2)
+	jl			4f // end
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm14, 64(%r11, %r12, 1)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 2)
+	je			4f // end
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm14, 96(%r11, %r12, 1)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 2)
+
+4:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_gen_lib4, .-inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// rbp  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// rbp  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_gen_lib4, @function
+inner_store_l_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_gen_lib4:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2sd	%r13d, %xmm14, %xmm14
+	vcvtsi2sd	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm12
+	vmovupd		.LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm12
+	vmovupd		LC03(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm12, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm13, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm6, %ymm5
+	vmovapd		%ymm3, %ymm2
+	vmovapd		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm13
+#endif
+
+	vmaskmovpd	%ymm0, %ymm14,  0(%r11)
+	vmaskmovpd	%ymm4, %ymm15,  0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x1, %ymm13, %ymm14, %ymm14
+	vmaskmovpd	%ymm1, %ymm14, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm15, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x2, %ymm13, %ymm14, %ymm14
+	vmaskmovpd	%ymm2, %ymm14, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm15, 64(%r11, %r12, 1)
+	je			3f // end
+	vblendpd	$0x4, %ymm13, %ymm14, %ymm14
+	vmaskmovpd	%ymm3, %ymm14, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm15, 96(%r11, %r12, 1)
+
+	jmp		3f
+
+0:
+	
+	cmpl	$1, %r10d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x03, %ymm4, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm0, %ymm12, %ymm0
+	vperm2f128	$0x03, %ymm13, %ymm4, %ymm12
+	vshufpd		$0x5, %ymm4, %ymm12, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x03, %ymm5, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm1, %ymm12, %ymm1
+	vperm2f128	$0x03, %ymm13, %ymm5, %ymm12
+	vshufpd		$0x5, %ymm5, %ymm12, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x03, %ymm6, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm2, %ymm12, %ymm2
+	vperm2f128	$0x03, %ymm13, %ymm6, %ymm12
+	vshufpd		$0x5, %ymm6, %ymm12, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x03, %ymm7, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm3, %ymm12, %ymm3
+	vperm2f128	$0x03, %ymm13, %ymm7, %ymm12
+	vshufpd		$0x5, %ymm7, %ymm12, %ymm7
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm15, %ymm12, %ymm15
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm12
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC08(%rip), %ymm12
+	vmovupd		.LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC08(%rip), %ymm12
+	vmovupd		LC05(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x1, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm15
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm4, %ymm14, 0(%r11, %r12, 1)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 2)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x2, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm14, 32(%r11, %r12, 1)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 2)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x4, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm14, 64(%r11, %r12, 1)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 2)
+	je			3f // end
+	vblendpd	$0x8, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm14, 96(%r11, %r12, 1)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 2)
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r10d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x03, %ymm4, %ymm0, %ymm0
+	vperm2f128	$0x03, %ymm13, %ymm4, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x03, %ymm5, %ymm1, %ymm1
+	vperm2f128	$0x03, %ymm13, %ymm5, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x03, %ymm6, %ymm2, %ymm2
+	vperm2f128	$0x03, %ymm13, %ymm6, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x03, %ymm7, %ymm3, %ymm3
+	vperm2f128	$0x03, %ymm13, %ymm7, %ymm7
+
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm14
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC09(%rip), %ymm12
+	vmovupd		.LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC09(%rip), %ymm12
+	vmovupd		LC06(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x3, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm15
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm4, %ymm14, 0(%r11, %r12, 1)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 2)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x4, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm14, 32(%r11, %r12, 1)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 2)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x8, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm14, 64(%r11, %r12, 1)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 2)
+	je			3f // end
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm14, 96(%r11, %r12, 1)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 2)
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x21, %ymm0, %ymm4, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm4, %ymm0
+	vperm2f128	$0x21, %ymm4, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x21, %ymm1, %ymm5, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm5, %ymm1
+	vperm2f128	$0x21, %ymm5, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x21, %ymm2, %ymm6, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm6, %ymm2
+	vperm2f128	$0x21, %ymm6, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x21, %ymm3, %ymm7, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm7, %ymm3
+	vperm2f128	$0x21, %ymm7, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm7
+
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm14
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC10(%rip), %ymm12
+	vmovupd		.LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC10(%rip), %ymm12
+	vmovupd		LC07(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x7, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm15
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm4, %ymm14, 0(%r11, %r12, 1)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 2)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x8, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm14, 32(%r11, %r12, 1)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 2)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm14, 64(%r11, %r12, 1)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 2)
+	je			3f // end
+	vblendpd	$0x2, %ymm15, %ymm14, %ymm14
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm14, 96(%r11, %r12, 1)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 2)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_gen_lib4, .-inner_store_l_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+//                               1      2              3          4        5          6             7          8        9          10
+// void kernel_dgemm_nt_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
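+//
+// A hedged scalar reference of what this kernel computes in the panel-major ("lib4",
+// bs=4) layout used throughout this file. The helper name below is illustrative only
+// (it is not part of the library); sda/sdc/sdd are panel strides, in 4-row panels:
+//
+//   static void ref_dgemm_nt_8x4_lib4(int k, const double *alpha, const double *A, int sda,
+//                                     const double *B, const double *beta, const double *C, int sdc,
+//                                     double *D, int sdd)
+//   {
+//       for (int j = 0; j < 4; j++)
+//           for (int i = 0; i < 8; i++) {
+//               double acc = 0.0;
+//               for (int l = 0; l < k; l++)
+//                   acc += A[i%4 + (i/4)*4*sda + l*4] * B[j + l*4];
+//               D[i%4 + (i/4)*4*sdd + j*4] = alpha[0]*acc + beta[0]*C[i%4 + (i/4)*4*sdc + j*4];
+//           }
+//   }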
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_8x4_lib4
+	.type kernel_dgemm_nt_8x4_lib4, @function
+kernel_dgemm_nt_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_8x4_lib4
+_kernel_dgemm_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_8x4_lib4
+	.def kernel_dgemm_nt_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_8x4_lib4, .-kernel_dgemm_nt_8x4_lib4
+#endif
+
+
+
+
+
+//                               1      2              3          4          5        6             7          8
+// void kernel_dgemm_nt_4x8_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x8_lib4
+	.type kernel_dgemm_nt_4x8_lib4, @function
+kernel_dgemm_nt_4x8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x8_lib4
+_kernel_dgemm_nt_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x8_lib4
+	.def kernel_dgemm_nt_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // B
+	movq	ARG5, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x8_lib4, .-kernel_dgemm_nt_4x8_lib4
+#endif
+
+
+
+
+
+//                                  rdi     rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32   rsp+40  rsp+48
+// void kernel_dgemm_nt_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_8x4_vs_lib4
+	.type kernel_dgemm_nt_8x4_vs_lib4, @function
+kernel_dgemm_nt_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_8x4_vs_lib4
+_kernel_dgemm_nt_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_8x4_vs_lib4
+	.def kernel_dgemm_nt_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // store address D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_8x4_vs_lib4, .-kernel_dgemm_nt_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                  1      2              3          4          5        6             7          8          9       10
+// void kernel_dgemm_nt_4x8_vs_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x8_vs_lib4
+	.type kernel_dgemm_nt_4x8_vs_lib4, @function
+kernel_dgemm_nt_4x8_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x8_vs_lib4
+_kernel_dgemm_nt_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x8_vs_lib4
+	.def kernel_dgemm_nt_4x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x8_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // B
+	movq	ARG5, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // km
+	movq	ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x8_vs_lib4, .-kernel_dgemm_nt_4x8_vs_lib4
+#endif
+
+
+
+
+
+//                                   rdi    rsi            rdx        rcx      r8         r9            rsp+8        rsp+16     rsp+24   rsp+32       rsp+40     rsp+48   rsp+56  rsp+64  rsp+72  rsp+80
+// void kernel_dgemm_nt_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_8x4_gen_lib4
+	.type kernel_dgemm_nt_8x4_gen_lib4, @function
+kernel_dgemm_nt_8x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_8x4_gen_lib4
+_kernel_dgemm_nt_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_8x4_gen_lib4
+	.def kernel_dgemm_nt_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // offsetC
+	movq	ARG8, %r13 // C
+	movq	ARG9, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG10, %r10 // offsetD
+	movq	ARG11, %r11 // D
+	movq	ARG12, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG13, %r13 // m0
+	movq	ARG14, %r14 // m1
+	movq	ARG15, %r15 // n0
+	movq	ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_8x4_gen_lib4, .-kernel_dgemm_nt_8x4_gen_lib4
+#endif
+
+
+
+
+
+//                               rdi    rsi            rdx        rcx      r8           r9         rsp+8    rsp+16        rsp+24     rsp+32   rsp+40     rsp+48
+// void kernel_dgemm_nn_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
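+//
+// A hedged sketch of the nn addressing: assuming B points to the 4-row panel holding its
+// first used row and offsetB is that row's position inside the panel (panel stride sdb),
+// element (l,j) of B lives at
+//
+//   B[(offsetB + l)%4 + ((offsetB + l)/4)*4*sdb + j*4]
+//
+// so that, for i = 0..7 and j = 0..3, the kernel computes
+//
+//   D(i,j) = alpha[0] * sum_{l=0..k-1} A(i,l)*B(l,j) + beta[0] * C(i,j)
+//
+// with A, C and D addressed exactly as in the nt kernels above.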
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_8x4_lib4
+	.type kernel_dgemm_nn_8x4_lib4, @function
+kernel_dgemm_nn_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_8x4_lib4
+_kernel_dgemm_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_8x4_lib4
+	.def kernel_dgemm_nn_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12 // C
+	movq	ARG10, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_8x4_lib4, .-kernel_dgemm_nn_8x4_lib4
+#endif
+
+
+
+
+
+//                               rdi    rsi            rdx        rcx          r8         r9       rsp+8         rsp+16     rsp+24
+// void kernel_dgemm_nn_4x8_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_4x8_lib4
+	.type kernel_dgemm_nn_4x8_lib4, @function
+kernel_dgemm_nn_4x8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_4x8_lib4
+_kernel_dgemm_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_4x8_lib4
+	.def kernel_dgemm_nn_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_4x8_lib4, .-kernel_dgemm_nn_4x8_lib4
+#endif
+
+
+
+
+
+//                                   rdi    rsi            rdx        rcx      r8        r9         rsp+8    rsp+16        rsp+24    rsp+32     rsp+40   rsp+48    rsp+56     rsp+64   rsp+72  rsp+80  rsp+88  rsp+96
+// void kernel_dgemm_nn_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_8x4_gen_lib4
+	.type kernel_dgemm_nn_8x4_gen_lib4, @function
+kernel_dgemm_nn_8x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_8x4_gen_lib4
+_kernel_dgemm_nn_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_8x4_gen_lib4
+	.def kernel_dgemm_nn_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_8x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13  // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12 // offsetC
+	movq	ARG10, %r13 // C
+	movq	ARG11, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG12, %r10 // offsetD
+	movq	ARG13, %r11 // D
+	movq	ARG14, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG15, %r13 // m0
+	movq	ARG16, %r14 // m1
+	movq	ARG17, %r15 // n0
+	movq	ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_8x4_gen_lib4, .-kernel_dgemm_nn_8x4_gen_lib4
+#endif
+
+
+
+
+
+//                                 rdi     rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32
+// void kernel_dsyrk_nt_l_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
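+//
+// A hedged scalar reference: the same product as kernel_dgemm_nt_8x4_lib4 above, but only
+// the lower triangle of the 8x4 block (i >= j) is written; entries strictly above the
+// diagonal are left untouched in D by the store_l routine used below:
+//
+//   for (int j = 0; j < 4; j++)
+//       for (int i = j; i < 8; i++) {
+//           double acc = 0.0;
+//           for (int l = 0; l < k; l++)
+//               acc += A[i%4 + (i/4)*4*sda + l*4] * B[j + l*4];
+//           D[i%4 + (i/4)*4*sdd + j*4] = alpha[0]*acc + beta[0]*C[i%4 + (i/4)*4*sdc + j*4];
+//       }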
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_8x4_lib4
+	.type kernel_dsyrk_nt_l_8x4_lib4, @function
+kernel_dsyrk_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_8x4_lib4
+_kernel_dsyrk_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_8x4_lib4
+	.def kernel_dsyrk_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_8x4_lib4, .-kernel_dsyrk_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+//                                    rdi     rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32   rsp+40  rsp+48
+// void kernel_dsyrk_nt_l_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_8x4_vs_lib4
+	.type kernel_dsyrk_nt_l_8x4_vs_lib4, @function
+kernel_dsyrk_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_8x4_vs_lib4
+_kernel_dsyrk_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_8x4_vs_lib4
+	.def kernel_dsyrk_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // store address D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_8x4_vs_lib4, .-kernel_dsyrk_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                     rdi    rsi            rdx        rcx      r8         r9            rsp+8        rsp+16     rsp+24   rsp+32       rsp+40     rsp+48   rsp+56  rsp+64  rsp+72  rsp+80
+// void kernel_dsyrk_nt_l_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_8x4_gen_lib4
+	.type kernel_dsyrk_nt_l_8x4_gen_lib4, @function
+kernel_dsyrk_nt_l_8x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_8x4_gen_lib4
+_kernel_dsyrk_nt_l_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_8x4_gen_lib4
+	.def kernel_dsyrk_nt_l_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // offsetC
+	movq	ARG8, %r13 // C
+	movq	ARG9, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG10, %r10 // offsetD
+	movq	ARG11, %r11 // D
+	movq	ARG12, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG13, %r13 // m0
+	movq	ARG14, %r14 // m1
+	movq	ARG15, %r15 // n0
+	movq	ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_8x4_gen_lib4, .-kernel_dsyrk_nt_l_8x4_gen_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx      r8           r9         rsp+8    rsp+16     rsp+24
+// void kernel_dtrmm_nn_rl_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nn_rl_8x4_lib4
+	.type kernel_dtrmm_nn_rl_8x4_lib4, @function
+kernel_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nn_rl_8x4_lib4
+_kernel_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nn_rl_8x4_lib4
+	.def kernel_dtrmm_nn_rl_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NN_RL_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nn_rl_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nn_rl_8x4_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nn_rl_8x4_lib4, .-kernel_dtrmm_nn_rl_8x4_lib4
+#endif
+
+
+
+
+
+//                                      rdi    rsi            rdx        rcx      r8           r9         rsp+8    rsp+16       rsp+24     rsp+32   rsp+40  rsp+48  rsp+56  rsp+64
+// void kernel_dtrmm_nn_rl_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nn_rl_8x4_gen_lib4
+	.type kernel_dtrmm_nn_rl_8x4_gen_lib4, @function
+kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nn_rl_8x4_gen_lib4
+_kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nn_rl_8x4_gen_lib4
+	.def kernel_dtrmm_nn_rl_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NN_RL_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nn_rl_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // offsetD
+	movq	ARG9, %r11 // D
+	movq	ARG10, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG11, %r13 // m0
+	movq	ARG12, %r14 // m1
+	movq	ARG13, %r15 // n0
+	movq	ARG14, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nn_rl_8x4_gen_lib4, .-kernel_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32
+// void kernel_dtrmm_nt_ru_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
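+//
+// Hedged outline of this kernel's structure (the work is done by the inner routines
+// called below; the scalar notation is illustrative only):
+//
+//   acc  = A(:,4:k-1) * B(:,4:k-1)^T      // dense part: k-4, A+4*bs, B+4*bs
+//   acc += edge over A(:,0:3) and the triangular leading 4x4 block of B
+//   D    = alpha[0]*acc + beta[0]*C       // then a plain 8x4 store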
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_8x4_lib4
+	.type kernel_dtrmm_nt_ru_8x4_lib4, @function
+kernel_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_8x4_lib4
+_kernel_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_8x4_lib4
+	.def kernel_dtrmm_nt_ru_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d //k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+	addq	$128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG3, %r10 // A
+	movq	ARG4, %r11 // sda
+	sall	$5, %r11d // 4*sda*sizeof(double)
+	movq	ARG5, %r12 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_8x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_8x4_lib4, .-kernel_dtrmm_nt_ru_8x4_lib4
+#endif
+
+
+
+
+
+//                                 rdi     rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32   rsp+40  rsp+48
+// void kernel_dtrmm_nt_ru_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_8x4_vs_lib4
+	.type kernel_dtrmm_nt_ru_8x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_8x4_vs_lib4
+_kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_8x4_vs_lib4
+	.def kernel_dtrmm_nt_ru_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d //k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+	addq	$128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // store address D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_8x4_vs_lib4, .-kernel_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24
+// void kernel_dpotrf_nt_l_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
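+//
+// Reference sketch (not part of the original source; plain 2D arrays, lib4 layout
+// ignored). The kernel is assumed to form M = C - A*B^T (8x4, A 8xk, B 4xk) and then
+// perform one lower-Cholesky step on that block, storing only the lower part of the
+// top 4x4 triangle and the reciprocal pivots in inv_diag_D:
+//
+//   for(int j=0; j<4; j++) {
+//     double d = M[j][j];
+//     for(int l=0; l<j; l++) d -= D[j][l]*D[j][l];
+//     D[j][j] = sqrt(d);
+//     inv_diag_D[j] = 1.0/D[j][j];
+//     for(int i=j+1; i<8; i++) {
+//       double t = M[i][j];
+//       for(int l=0; l<j; l++) t -= D[i][l]*D[j][l];
+//       D[i][j] = t * inv_diag_D[j];
+//     }
+//   }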
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_8x4_lib4
+	.type kernel_dpotrf_nt_l_8x4_lib4, @function
+kernel_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_8x4_lib4
+_kernel_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_8x4_lib4
+	.def kernel_dpotrf_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_8x4_lib4, .-kernel_dpotrf_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+//                                     rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24              rsp+32  rsp+40 
+// void kernel_dpotrf_nt_l_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_8x4_vs_lib4
+	.type kernel_dpotrf_nt_l_8x4_vs_lib4, @function
+kernel_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_8x4_vs_lib4
+_kernel_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_8x4_vs_lib4
+	.def kernel_dpotrf_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG10, %r12 // km 
+	movq	ARG11, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_8x4_vs_lib4, .-kernel_dpotrf_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                        rdi     rsi         rdx       rcx         r8      r9          rsp+8     rsp+16      rsp+24     rsp+32   rsp+40     rsp+48   rsp+56
+// void kernel_dsyrk_dpotrf_nt_l_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
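+//
+// Reference note (not part of the original source): this kernel is assumed to fuse
+// the update and the factorization of the previous routines, accumulating
+// M = C + Ap*Bp^T - Am*Bm^T with the two gemm passes below and then applying the
+// same 8x4 lower-Cholesky step as kernel_dpotrf_nt_l_8x4_lib4, with the reciprocal
+// pivots written to inv_diag_D.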
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_8x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_8x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+//                                           rdi     rsi         rdx       rcx         r8      r9          rsp+8     rsp+16      rsp+24     rsp+32   rsp+40     rsp+48   rsp+56              rsp+64  rsp+72
+// void kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movq	ARG15, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG14, %r12 // km 
+	movq	ARG15, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                         rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32              rsp+40  rsp+48
+// void kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
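+//
+// Reference sketch (not part of the original source; plain 2D arrays, lib4 layout
+// ignored). The kernel is assumed to form M = C - A*B^T (8x4) and solve
+// D * E^T = M with E a 4x4 lower-triangular factor, inv_diag_E holding the
+// reciprocals of its diagonal, and km/kn masking the stored block:
+//
+//   for(int i=0; i<km; i++)
+//     for(int j=0; j<kn; j++) {
+//       double t = M[i][j];
+//       for(int l=0; l<j; l++) t -= D[i][l] * E[j][l];   // E used through its transpose
+//       D[i][j] = t * inv_diag_E[j];                      // multiply instead of dividing
+//     }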
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+	.type kernel_dtrsm_nt_rl_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+	.def kernel_dtrsm_nt_rl_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_8x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                               rdi     rsi         rdx       rcx          r8     r9          rsp+8     rsp+16      rsp+24     rsp+32   rsp+40     rsp+48   rsp+56     rsp+64              rsp+72  rsp+80
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
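+//
+// Reference note (not part of the original source): assumed to fuse a gemm update
+// with the triangular solve of the previous kernel, i.e. M = C + Ap*Bp^T - Am*Bm^T
+// followed by the solve of D * E^T = M with E lower triangular and inv_diag_E its
+// reciprocal diagonal, masked to km rows and kn columns on store.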
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10  // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG13, %r10  // E 
+	movq	ARG14, %r11  // inv_diag_E 
+	movq	ARG16, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG15, %r12 // km 
+	movq	ARG16, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32 
+// void kernel_dtrsm_nt_rl_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_8x4_lib4
+	.type kernel_dtrsm_nt_rl_inv_8x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_8x4_lib4
+_kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_8x4_lib4
+	.def kernel_dtrsm_nt_rl_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_8x4_lib4, .-kernel_dtrsm_nt_rl_inv_8x4_lib4
+#endif
+
+
+
+
+
+//                                            rdi     rsi         rdx       rcx         r8      r9          rsp+8     rsp+16      rsp+24     rsp+32   rsp+40     rsp+48   rsp+56     rsp+64
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10  // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG13, %r10  // E 
+	movq	ARG14, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+#endif
+
+
+
+
+
+//                                      rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24
+// void kernel_dtrsm_nt_rl_one_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E);
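+//
+// Reference sketch (not part of the original source; plain 2D arrays). Like the
+// rl_inv kernels above, but E is assumed to be unit lower triangular, so solving
+// D * E^T = C - A*B^T needs no diagonal reciprocals:
+//
+//   for(int i=0; i<8; i++)
+//     for(int j=0; j<4; j++) {
+//       double t = M[i][j];                               // M = C - A*B^T
+//       for(int l=0; l<j; l++) t -= D[i][l] * E[j][l];
+//       D[i][j] = t;                                      // E[j][j] == 1
+//     }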
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_one_8x4_lib4
+	.type kernel_dtrsm_nt_rl_one_8x4_lib4, @function
+kernel_dtrsm_nt_rl_one_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_one_8x4_lib4
+_kernel_dtrsm_nt_rl_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_one_8x4_lib4
+	.def kernel_dtrsm_nt_rl_one_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_one_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_one_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_one_8x4_lib4, .-kernel_dtrsm_nt_rl_one_8x4_lib4
+#endif
+
+
+
+
+
+//                                         rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32  rsp+40
+// void kernel_dtrsm_nt_rl_one_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+	.type kernel_dtrsm_nt_rl_one_8x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+	.def kernel_dtrsm_nt_rl_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_ONE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG10, %r12 // km 
+	movq	ARG11, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_one_8x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32 
+// void kernel_dtrsm_nt_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
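+//
+// Reference sketch (not part of the original source; plain 2D arrays). The kernel
+// is assumed to solve D * E^T = C - A*B^T with E a 4x4 upper-triangular factor
+// (hence the rut edge routine below), sweeping the columns of D backwards:
+//
+//   for(int i=0; i<8; i++)
+//     for(int j=3; j>=0; j--) {
+//       double t = M[i][j];                               // M = C - A*B^T
+//       for(int l=j+1; l<4; l++) t -= D[i][l] * E[j][l];
+//       D[i][j] = t * inv_diag_E[j];
+//     }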
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_ru_inv_8x4_lib4
+	.type kernel_dtrsm_nt_ru_inv_8x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_ru_inv_8x4_lib4
+_kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_ru_inv_8x4_lib4
+	.def kernel_dtrsm_nt_ru_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rut_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rut_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_ru_inv_8x4_lib4, .-kernel_dtrsm_nt_ru_inv_8x4_lib4
+#endif
+
+
+
+
+
+//                                         rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32              rsp+40  rsp+48
+// void kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+	.type kernel_dtrsm_nt_ru_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+	.def kernel_dtrsm_nt_ru_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_ru_inv_8x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40
+// void kernel_dtrsm_nn_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
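+//
+// Reference sketch (not part of the original source; plain 2D arrays). The nn
+// variant is assumed to form M = C - A*B (no transpose on B, which carries its
+// own stride sdb) and solve D * E = M with E a 4x4 upper-triangular factor:
+//
+//   for(int i=0; i<8; i++)
+//     for(int j=0; j<4; j++) {
+//       double t = M[i][j];
+//       for(int l=0; l<j; l++) t -= D[i][l] * E[l][j];
+//       D[i][j] = t * inv_diag_E[j];
+//     }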
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ru_inv_8x4_lib4
+	.type kernel_dtrsm_nn_ru_inv_8x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ru_inv_8x4_lib4
+_kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ru_inv_8x4_lib4
+	.def kernel_dtrsm_nn_ru_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_run_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ru_inv_8x4_lib4, .-kernel_dtrsm_nn_ru_inv_8x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40              rsp+48  rsp+56
+// void kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+	.type kernel_dtrsm_nn_ru_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+	.def kernel_dtrsm_nn_ru_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_run_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG12, %r12 // km
+	movq	ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ru_inv_8x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40
+// void kernel_dtrsm_nn_ll_one_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde);
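+//
+// Reference sketch (not part of the original source; plain 2D arrays). Here E is
+// assumed to be an 8x8 unit lower-triangular factor (with its own stride sde)
+// applied from the left, so the kernel solves E * D = C - A*B by forward
+// substitution over the rows:
+//
+//   for(int j=0; j<4; j++)
+//     for(int i=0; i<8; i++) {
+//       double t = M[i][j];                               // M = C - A*B
+//       for(int l=0; l<i; l++) t -= E[i][l] * D[l][j];
+//       D[i][j] = t;                                      // unit diagonal
+//     }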
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ll_one_8x4_lib4
+	.type kernel_dtrsm_nn_ll_one_8x4_lib4, @function
+kernel_dtrsm_nn_ll_one_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ll_one_8x4_lib4
+_kernel_dtrsm_nn_ll_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ll_one_8x4_lib4
+	.def kernel_dtrsm_nn_ll_one_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // sde
+	sall	$5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lln_one_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ll_one_8x4_lib4, .-kernel_dtrsm_nn_ll_one_8x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40   rsp+48  rsp+56
+// void kernel_dtrsm_nn_ll_one_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+	.type kernel_dtrsm_nn_ll_one_8x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+	.def kernel_dtrsm_nn_ll_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // sde
+	sall	$5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lln_one_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG12, %r12 // km
+	movq	ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ll_one_8x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40   rsp+48
+// void kernel_dtrsm_nn_lu_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
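+//
+// Reference sketch (not part of the original source; plain 2D arrays). E is
+// assumed to be an 8x8 upper-triangular factor (stride sde) applied from the
+// left, so the kernel solves E * D = C - A*B by backward substitution:
+//
+//   for(int j=0; j<4; j++)
+//     for(int i=7; i>=0; i--) {
+//       double t = M[i][j];                               // M = C - A*B
+//       for(int l=i+1; l<8; l++) t -= E[i][l] * D[l][j];
+//       D[i][j] = t * inv_diag_E[i];
+//     }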
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_lu_inv_8x4_lib4
+	.type kernel_dtrsm_nn_lu_inv_8x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_lu_inv_8x4_lib4
+_kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_lu_inv_8x4_lib4
+	.def kernel_dtrsm_nn_lu_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // sde
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG12, %r12  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lun_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lun_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_lu_inv_8x4_lib4, .-kernel_dtrsm_nn_lu_inv_8x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40   rsp+48              rsp+56  rsp+64
+// void kernel_dtrsm_nn_lu_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+	.type kernel_dtrsm_nn_lu_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+	.def kernel_dtrsm_nn_lu_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // sde
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG12, %r12  // inv_diag_E 
+	movq	ARG13, %r13  // km
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LUN_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG13, %r12  // km
+	movq	ARG14, %r13  // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_lu_inv_8x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                edi    rsi        rdx      rcx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32
+// void kernel_dgetrf_nn_l_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
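+//
+// Reference sketch (not part of the original source; plain 2D arrays). The kernel
+// is assumed to form M = C - A*B and perform an unpivoted LU step on the left
+// 8x4 panel, storing the unit-lower multipliers below the diagonal, U on and
+// above it, and the pivot reciprocals in inv_diag_D:
+//
+//   for(int j=0; j<4; j++) {
+//     inv_diag_D[j] = 1.0 / M[j][j];
+//     for(int i=j+1; i<8; i++) M[i][j] *= inv_diag_D[j];  // L multipliers
+//     for(int jj=j+1; jj<4; jj++)                         // trailing update
+//       for(int i=j+1; i<8; i++) M[i][jj] -= M[i][j] * M[j][jj];
+//   }
+//   // D = M, factored in place in this sketch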
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_l_8x4_lib4
+	.type kernel_dgetrf_nn_l_8x4_lib4, @function
+kernel_dgetrf_nn_l_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_l_8x4_lib4
+_kernel_dgetrf_nn_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_l_8x4_lib4
+	.def kernel_dgetrf_nn_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG10, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_l_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	// epilogue
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_l_8x4_lib4, .-kernel_dgetrf_nn_l_8x4_lib4
+#endif
+
+
+
+
+
+//                                   edi    rsi        rdx      rcx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32              rsp+40  rsp+48
+// void kernel_dgetrf_nn_l_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_l_8x4_vs_lib4
+	.type kernel_dgetrf_nn_l_8x4_vs_lib4, @function
+kernel_dgetrf_nn_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_l_8x4_vs_lib4
+_kernel_dgetrf_nn_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_l_8x4_vs_lib4
+	.def kernel_dgetrf_nn_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG10, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_l_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG11, %r12  // km
+	movq	ARG12, %r13  // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_l_8x4_vs_lib4, .-kernel_dgetrf_nn_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                             1         2           3           4           5
+// void kernel_dlarfb4_r_8_lib4(int kmax, double *pV, double *pT, double *pD, int sdd);
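+//
+// Reference note (not part of the original source): following the LAPACK dlarfb
+// naming, this kernel is assumed to apply a compact-WY block reflector built from
+// 4 Householder vectors to an 8-row panel from the right: with V the vectors in
+// pV and T the 4x4 triangular factor in pT, it updates pD (8 x kmax, panel
+// stride sdd) roughly as  D <- D + (D * V^T) * T * V,  the sign convention being
+// folded into T by the factorization kernel that produced it.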
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dlarfb4_r_8_lib4
+	.type kernel_dlarfb4_r_8_lib4, @function
+kernel_dlarfb4_r_8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dlarfb4_r_8_lib4
+_kernel_dlarfb4_r_8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dlarfb4_r_8_lib4
+	.def kernel_dlarfb4_r_8_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // D
+	movq	ARG5, %r12 // sdd
+	sall	$5, %r12d
+	movq	ARG2, %r13 // V
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // D
+	movq	ARG5, %r12 // sdd
+	sall	$5, %r12d
+	movq	ARG2, %r13 // V
+
+	//
+	vmovapd			0(%r11), %ymm12
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vaddpd			%ymm12, %ymm0, %ymm0
+	vaddpd			%ymm14, %ymm4, %ymm4
+	//
+	vmovapd			32(%r11), %ymm12
+	vmovapd			32(%r11, %r12, 1), %ymm14
+	vaddpd			%ymm12, %ymm1, %ymm1
+	vaddpd			%ymm14, %ymm5, %ymm5
+	vbroadcastsd	32(%r13), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	//
+	vmovapd			64(%r11), %ymm12
+	vmovapd			64(%r11, %r12, 1), %ymm14
+	vaddpd			%ymm12, %ymm2, %ymm2
+	vaddpd			%ymm14, %ymm6, %ymm6
+	vbroadcastsd	64(%r13), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	72(%r13), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	//
+	vmovapd			96(%r11), %ymm12
+	vmovapd			96(%r11, %r12, 1), %ymm14
+	vaddpd			%ymm12, %ymm3, %ymm3
+	vaddpd			%ymm14, %ymm7, %ymm7
+	vbroadcastsd	96(%r13), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	104(%r13), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	112(%r13), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
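+	// second stage: W = W * T, with T the 4x4 upper triangular factor of the
+	// block reflector; columns of W are updated in place from last to first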
+	movq	ARG3, %r10 // T
+
+	//
+	vbroadcastsd	120(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vmulpd			%ymm7, %ymm12, %ymm7
+	//
+	vbroadcastsd	112(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm6, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+	vbroadcastsd	80(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vmulpd			%ymm6, %ymm12, %ymm6
+	//
+	vbroadcastsd	104(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm5, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+	vbroadcastsd	72(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm5, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	40(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vmulpd			%ymm5, %ymm12, %ymm5
+	//
+	vbroadcastsd	96(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm4, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+	vbroadcastsd	64(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm4, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	32(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm4, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	0(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+	vmulpd			%ymm4, %ymm12, %ymm4
+
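+	// third stage: update D with the product of W and pV: the leading 4 columns
+	// (unit-triangular block of pV) are updated explicitly, the remaining
+	// columns via the dgebp kernel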
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // V
+	movq	ARG4, %r12 // D
+	movq	ARG5, %r13 // sdd
+	sall	$5, %r13d
+
+	//
+	vmovapd			0(%r12), %ymm12
+	vmovapd			0(%r12, %r13, 1), %ymm14
+	vaddpd			%ymm12, %ymm0, %ymm12
+	vaddpd			%ymm14, %ymm4, %ymm14
+	vmovapd			%ymm12, 0(%r12)
+	vmovapd			%ymm14, 0(%r12, %r13, 1)
+	//
+	vmovapd			32(%r12), %ymm12
+	vmovapd			32(%r12, %r13, 1), %ymm14
+	vbroadcastsd	32(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vaddpd			%ymm12, %ymm1, %ymm12
+	vaddpd			%ymm14, %ymm5, %ymm14
+	vmovapd			%ymm12, 32(%r12)
+	vmovapd			%ymm14, 32(%r12, %r13, 1)
+	//
+	vmovapd			64(%r12), %ymm12
+	vmovapd			64(%r12, %r13, 1), %ymm14
+	vbroadcastsd	64(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	72(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vaddpd			%ymm12, %ymm2, %ymm12
+	vaddpd			%ymm14, %ymm6, %ymm14
+	vmovapd			%ymm12, 64(%r12)
+	vmovapd			%ymm14, 64(%r12, %r13, 1)
+	//
+	vmovapd			96(%r12), %ymm12
+	vmovapd			96(%r12, %r13, 1), %ymm14
+	vbroadcastsd	96(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	104(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	112(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm6, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vaddpd			%ymm12, %ymm3, %ymm12
+	vaddpd			%ymm14, %ymm7, %ymm14
+	vmovapd			%ymm12, 96(%r12)
+	vmovapd			%ymm14, 96(%r12, %r13, 1)
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEBP_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgebp_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgebp_add_nn_8x4_lib4
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlarfb4_r_8_lib4, .-kernel_dlarfb4_r_8_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemm_diag_lib4.c b/kernel/avx/kernel_dgemm_diag_lib4.c
new file mode 100644
index 0000000..d64f977
--- /dev/null
+++ b/kernel/avx/kernel_dgemm_diag_lib4.c
@@ -0,0 +1,866 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+
+
+// B is the diagonal of a matrix, beta==0.0 case
+void kernel_dgemm_diag_right_4_a0_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m256d
+		alpha0,
+		mask_f,
+		sign,
+		a_00,
+		b_00, b_11, b_22, b_33,
+		d_00, d_01, d_02, d_03;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_sd( alpha );
+	
+	b_00 = _mm256_broadcast_sd( &B[0] );
+	b_00 = _mm256_mul_pd( b_00, alpha0 );
+	b_11 = _mm256_broadcast_sd( &B[1] );
+	b_11 = _mm256_mul_pd( b_11, alpha0 );
+	b_22 = _mm256_broadcast_sd( &B[2] );
+	b_22 = _mm256_mul_pd( b_22, alpha0 );
+	b_33 = _mm256_broadcast_sd( &B[3] );
+	b_33 = _mm256_mul_pd( b_33, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		a_00 = _mm256_load_pd( &A[4] );
+		d_01 = _mm256_mul_pd( a_00, b_11 );
+		a_00 = _mm256_load_pd( &A[8] );
+		d_02 = _mm256_mul_pd( a_00, b_22 );
+		a_00 = _mm256_load_pd( &A[12] );
+		d_03 = _mm256_mul_pd( a_00, b_33 );
+
+		_mm256_store_pd( &D[0], d_00 );
+		_mm256_store_pd( &D[4], d_01 );
+		_mm256_store_pd( &D[8], d_02 );
+		_mm256_store_pd( &D[12], d_03 );
+
+		A += 4*sda;
+		D += 4*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+		double m_f = kmax-k;
+
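+		// lane i of the mask holds (i+0.5)-(kmax-k); the sign bit is set exactly
+		// for the kmax-k remaining rows, which the masked stores below write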
+		mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		a_00 = _mm256_load_pd( &A[4] );
+		d_01 = _mm256_mul_pd( a_00, b_11 );
+		a_00 = _mm256_load_pd( &A[8] );
+		d_02 = _mm256_mul_pd( a_00, b_22 );
+		a_00 = _mm256_load_pd( &A[12] );
+		d_03 = _mm256_mul_pd( a_00, b_33 );
+
+		_mm256_maskstore_pd( &D[0], mask_i, d_00 );
+		_mm256_maskstore_pd( &D[4], mask_i, d_01 );
+		_mm256_maskstore_pd( &D[8], mask_i, d_02 );
+		_mm256_maskstore_pd( &D[12], mask_i, d_03 );
+
+		}
+	
+	}
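+
+
+// Illustrative scalar sketch (compiled out, not part of the library) of what the
+// vectorized kernel above computes, assuming the lib4 panel-major layout used in
+// this file: rows are grouped in panels of 4, each panel is stored column by
+// column, and sda/sdd are the panel leading dimensions (a panel occupies 4*sda
+// doubles). kmax counts the rows of A and D, B holds the 4 diagonal entries.
+// The *_lib4 variants below additionally read C, scale it by beta and add it.
+#if 0
+void ref_dgemm_diag_right_4_a0(int kmax, double *alpha, double *A, int sda, double *B, double *D, int sdd)
+	{
+	int ii, jj;
+	for(ii=0; ii<kmax; ii++)
+		{
+		for(jj=0; jj<4; jj++)
+			{
+			// element (ii,jj): panel ii/4, column jj, row ii%4 within the panel
+			D[(ii/4)*4*sdd + jj*4 + ii%4] = alpha[0] * A[(ii/4)*4*sda + jj*4 + ii%4] * B[jj];
+			}
+		}
+	}
+#endif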
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m256d
+		alpha0, beta0,
+		mask_f,
+		sign,
+		a_00,
+		b_00, b_11, b_22, b_33,
+		c_00,
+		d_00, d_01, d_02, d_03;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_sd( alpha );
+	beta0  = _mm256_broadcast_sd( beta );
+	
+	b_00 = _mm256_broadcast_sd( &B[0] );
+	b_00 = _mm256_mul_pd( b_00, alpha0 );
+	b_11 = _mm256_broadcast_sd( &B[1] );
+	b_11 = _mm256_mul_pd( b_11, alpha0 );
+	b_22 = _mm256_broadcast_sd( &B[2] );
+	b_22 = _mm256_mul_pd( b_22, alpha0 );
+	b_33 = _mm256_broadcast_sd( &B[3] );
+	b_33 = _mm256_mul_pd( b_33, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		a_00 = _mm256_load_pd( &A[4] );
+		d_01 = _mm256_mul_pd( a_00, b_11 );
+		a_00 = _mm256_load_pd( &A[8] );
+		d_02 = _mm256_mul_pd( a_00, b_22 );
+		a_00 = _mm256_load_pd( &A[12] );
+		d_03 = _mm256_mul_pd( a_00, b_33 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+		c_00 = _mm256_load_pd( &C[4] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_01 = _mm256_add_pd( c_00, d_01 );
+		c_00 = _mm256_load_pd( &C[8] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_02 = _mm256_add_pd( c_00, d_02 );
+		c_00 = _mm256_load_pd( &C[12] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_03 = _mm256_add_pd( c_00, d_03 );
+
+		_mm256_store_pd( &D[0], d_00 );
+		_mm256_store_pd( &D[4], d_01 );
+		_mm256_store_pd( &D[8], d_02 );
+		_mm256_store_pd( &D[12], d_03 );
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+		double m_f = kmax-k;
+
+		mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		a_00 = _mm256_load_pd( &A[4] );
+		d_01 = _mm256_mul_pd( a_00, b_11 );
+		a_00 = _mm256_load_pd( &A[8] );
+		d_02 = _mm256_mul_pd( a_00, b_22 );
+		a_00 = _mm256_load_pd( &A[12] );
+		d_03 = _mm256_mul_pd( a_00, b_33 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+		c_00 = _mm256_load_pd( &C[4] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_01 = _mm256_add_pd( c_00, d_01 );
+		c_00 = _mm256_load_pd( &C[8] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_02 = _mm256_add_pd( c_00, d_02 );
+		c_00 = _mm256_load_pd( &C[12] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_03 = _mm256_add_pd( c_00, d_03 );
+
+		_mm256_maskstore_pd( &D[0], mask_i, d_00 );
+		_mm256_maskstore_pd( &D[4], mask_i, d_01 );
+		_mm256_maskstore_pd( &D[8], mask_i, d_02 );
+		_mm256_maskstore_pd( &D[12], mask_i, d_03 );
+
+		}
+	
+	}
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_3_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m256d
+		alpha0, beta0,
+		mask_f,
+		sign,
+		a_00,
+		b_00, b_11, b_22,
+		c_00,
+		d_00, d_01, d_02;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_sd( alpha );
+	beta0  = _mm256_broadcast_sd( beta );
+	
+	b_00 = _mm256_broadcast_sd( &B[0] );
+	b_00 = _mm256_mul_pd( b_00, alpha0 );
+	b_11 = _mm256_broadcast_sd( &B[1] );
+	b_11 = _mm256_mul_pd( b_11, alpha0 );
+	b_22 = _mm256_broadcast_sd( &B[2] );
+	b_22 = _mm256_mul_pd( b_22, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		a_00 = _mm256_load_pd( &A[4] );
+		d_01 = _mm256_mul_pd( a_00, b_11 );
+		a_00 = _mm256_load_pd( &A[8] );
+		d_02 = _mm256_mul_pd( a_00, b_22 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+		c_00 = _mm256_load_pd( &C[4] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_01 = _mm256_add_pd( c_00, d_01 );
+		c_00 = _mm256_load_pd( &C[8] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_02 = _mm256_add_pd( c_00, d_02 );
+
+		_mm256_store_pd( &D[0], d_00 );
+		_mm256_store_pd( &D[4], d_01 );
+		_mm256_store_pd( &D[8], d_02 );
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+		double m_f = kmax-k;
+
+		mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		a_00 = _mm256_load_pd( &A[4] );
+		d_01 = _mm256_mul_pd( a_00, b_11 );
+		a_00 = _mm256_load_pd( &A[8] );
+		d_02 = _mm256_mul_pd( a_00, b_22 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+		c_00 = _mm256_load_pd( &C[4] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_01 = _mm256_add_pd( c_00, d_01 );
+		c_00 = _mm256_load_pd( &C[8] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_02 = _mm256_add_pd( c_00, d_02 );
+
+		_mm256_maskstore_pd( &D[0], mask_i, d_00 );
+		_mm256_maskstore_pd( &D[4], mask_i, d_01 );
+		_mm256_maskstore_pd( &D[8], mask_i, d_02 );
+
+		}
+	
+	}
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_2_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m256d
+		alpha0, beta0,
+		mask_f,
+		sign,
+		a_00,
+		b_00, b_11,
+		c_00,
+		d_00, d_01;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_sd( alpha );
+	beta0  = _mm256_broadcast_sd( beta );
+	
+	b_00 = _mm256_broadcast_sd( &B[0] );
+	b_00 = _mm256_mul_pd( b_00, alpha0 );
+	b_11 = _mm256_broadcast_sd( &B[1] );
+	b_11 = _mm256_mul_pd( b_11, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		a_00 = _mm256_load_pd( &A[4] );
+		d_01 = _mm256_mul_pd( a_00, b_11 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+		c_00 = _mm256_load_pd( &C[4] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_01 = _mm256_add_pd( c_00, d_01 );
+
+		_mm256_store_pd( &D[0], d_00 );
+		_mm256_store_pd( &D[4], d_01 );
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+		double m_f = kmax-k;
+
+		mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		a_00 = _mm256_load_pd( &A[4] );
+		d_01 = _mm256_mul_pd( a_00, b_11 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+		c_00 = _mm256_load_pd( &C[4] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_01 = _mm256_add_pd( c_00, d_01 );
+
+		_mm256_maskstore_pd( &D[0], mask_i, d_00 );
+		_mm256_maskstore_pd( &D[4], mask_i, d_01 );
+
+		}
+
+	}
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_1_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m256d
+		alpha0, beta0,
+		mask_f,
+		sign,
+		a_00,
+		b_00,
+		c_00,
+		d_00;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_sd( alpha );
+	beta0  = _mm256_broadcast_sd( beta );
+	
+	b_00 = _mm256_broadcast_sd( &B[0] );
+	b_00 = _mm256_mul_pd( b_00, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+
+		_mm256_store_pd( &D[0], d_00 );
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+		double m_f = kmax-k;
+
+		mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+
+		_mm256_maskstore_pd( &D[0], mask_i, d_00 );
+
+		}
+	
+	}
+
+
+
+// A is the diagonal of a matrix, beta=0.0 case
+void kernel_dgemm_diag_left_4_a0_lib4(int kmax, double *alpha, double *A, double *B, double *D)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m256d
+		alpha0,
+		sign,
+		a_00,
+		b_00,
+		d_00, d_01, d_02, d_03;
+	
+	alpha0 = _mm256_broadcast_sd( alpha );
+	
+	a_00 = _mm256_load_pd( &A[0] );
+	a_00 = _mm256_mul_pd( a_00, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_00 = _mm256_load_pd( &B[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[4] );
+		d_01 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[8] );
+		d_02 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[12] );
+		d_03 = _mm256_mul_pd( a_00, b_00 );
+
+		_mm256_store_pd( &D[0], d_00 );
+		_mm256_store_pd( &D[4], d_01 );
+		_mm256_store_pd( &D[8], d_02 );
+		_mm256_store_pd( &D[12], d_03 );
+
+		B += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_00 = _mm256_load_pd( &B[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+
+		_mm256_store_pd( &D[0], d_00 );
+
+		B += 4;
+		D += 4;
+		
+		}
+
+	}
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m256d
+		alpha0, beta0,
+		sign,
+		a_00,
+		b_00,
+		c_00,
+		d_00, d_01, d_02, d_03;
+	
+	alpha0 = _mm256_broadcast_sd( alpha );
+	beta0  = _mm256_broadcast_sd( beta );
+	
+	a_00 = _mm256_load_pd( &A[0] );
+	a_00 = _mm256_mul_pd( a_00, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_00 = _mm256_load_pd( &B[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[4] );
+		d_01 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[8] );
+		d_02 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[12] );
+		d_03 = _mm256_mul_pd( a_00, b_00 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+		c_00 = _mm256_load_pd( &C[4] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_01 = _mm256_add_pd( c_00, d_01 );
+		c_00 = _mm256_load_pd( &C[8] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_02 = _mm256_add_pd( c_00, d_02 );
+		c_00 = _mm256_load_pd( &C[12] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_03 = _mm256_add_pd( c_00, d_03 );
+
+		_mm256_store_pd( &D[0], d_00 );
+		_mm256_store_pd( &D[4], d_01 );
+		_mm256_store_pd( &D[8], d_02 );
+		_mm256_store_pd( &D[12], d_03 );
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_00 = _mm256_load_pd( &B[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+
+		_mm256_store_pd( &D[0], d_00 );
+
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+
+	}
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_3_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m256i
+		mask;
+
+	__m256d
+		alpha0, beta0,
+		sign,
+		a_00,
+		b_00,
+		c_00,
+		d_00, d_01, d_02, d_03;
+	
+	mask = _mm256_set_epi64x( 1, -1, -1, -1 );
+		
+	alpha0 = _mm256_broadcast_sd( alpha );
+	beta0  = _mm256_broadcast_sd( beta );
+	
+	a_00 = _mm256_load_pd( &A[0] );
+	a_00 = _mm256_mul_pd( a_00, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_00 = _mm256_load_pd( &B[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[4] );
+		d_01 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[8] );
+		d_02 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[12] );
+		d_03 = _mm256_mul_pd( a_00, b_00 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+		c_00 = _mm256_load_pd( &C[4] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_01 = _mm256_add_pd( c_00, d_01 );
+		c_00 = _mm256_load_pd( &C[8] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_02 = _mm256_add_pd( c_00, d_02 );
+		c_00 = _mm256_load_pd( &C[12] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_03 = _mm256_add_pd( c_00, d_03 );
+
+		_mm256_maskstore_pd( &D[0], mask, d_00 );
+		_mm256_maskstore_pd( &D[4], mask, d_01 );
+		_mm256_maskstore_pd( &D[8], mask, d_02 );
+		_mm256_maskstore_pd( &D[12], mask, d_03 );
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_00 = _mm256_load_pd( &B[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+
+		_mm256_maskstore_pd( &D[0], mask, d_00 );
+
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+
+	}
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_2_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m128d
+		alpha0, beta0,
+		sign,
+		a_00,
+		b_00,
+		c_00,
+		d_00, d_01, d_02, d_03;
+		
+	alpha0 = _mm_loaddup_pd( alpha );
+	beta0  = _mm_loaddup_pd( beta );
+	
+	a_00 = _mm_load_pd( &A[0] );
+	a_00 = _mm_mul_pd( a_00, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_00 = _mm_load_pd( &B[0] );
+		d_00 = _mm_mul_pd( a_00, b_00 );
+		b_00 = _mm_load_pd( &B[4] );
+		d_01 = _mm_mul_pd( a_00, b_00 );
+		b_00 = _mm_load_pd( &B[8] );
+		d_02 = _mm_mul_pd( a_00, b_00 );
+		b_00 = _mm_load_pd( &B[12] );
+		d_03 = _mm_mul_pd( a_00, b_00 );
+
+		c_00 = _mm_load_pd( &C[0] );
+		c_00 = _mm_mul_pd( c_00, beta0 );
+		d_00 = _mm_add_pd( c_00, d_00 );
+		c_00 = _mm_load_pd( &C[4] );
+		c_00 = _mm_mul_pd( c_00, beta0 );
+		d_01 = _mm_add_pd( c_00, d_01 );
+		c_00 = _mm_load_pd( &C[8] );
+		c_00 = _mm_mul_pd( c_00, beta0 );
+		d_02 = _mm_add_pd( c_00, d_02 );
+		c_00 = _mm_load_pd( &C[12] );
+		c_00 = _mm_mul_pd( c_00, beta0 );
+		d_03 = _mm_add_pd( c_00, d_03 );
+
+		_mm_store_pd( &D[0], d_00 );
+		_mm_store_pd( &D[4], d_01 );
+		_mm_store_pd( &D[8], d_02 );
+		_mm_store_pd( &D[12], d_03 );
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_00 = _mm_load_pd( &B[0] );
+		d_00 = _mm_mul_pd( a_00, b_00 );
+
+		c_00 = _mm_load_pd( &C[0] );
+		c_00 = _mm_mul_pd( c_00, beta0 );
+		d_00 = _mm_add_pd( c_00, d_00 );
+
+		_mm_store_pd( &D[0], d_00 );
+
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+
+	
+	}
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_1_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0,
+		b_0,
+		c_0;
+	
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = A[0] * alpha0;
+		
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		b_0 = B[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+
+		D[0+bs*1] = c_0;
+		
+
+		b_0 = B[0+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+
+		D[0+bs*2] = c_0;
+		
+
+		b_0 = B[0+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+
+		D[0+bs*3] = c_0;
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+		
+	}
+
+
+
diff --git a/kernel/avx/kernel_dgemv_12_lib4.S b/kernel/avx/kernel_dgemv_12_lib4.S
new file mode 100644
index 0000000..c51ad9a
--- /dev/null
+++ b/kernel/avx/kernel_dgemv_12_lib4.S
@@ -0,0 +1,1322 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- x
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z8 z9 za zb]_a
+// ymm3  <- [z0 z1 z2 z3]_b
+// ymm4  <- [z4 z5 z6 z7]_b
+// ymm5  <- [z8 z9 za zb]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- x+k*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z8 z9 za zb]_a
+// ymm3  <- [z0 z1 z2 z3]_b
+// ymm4  <- [z4 z5 z6 z7]_b
+// ymm5  <- [z8 z9 za zb]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
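+//
+// note: the 12 result entries are held in three 4-wide accumulators, and two
+// independent accumulator sets are used in the main loop (set _a in ymm0..2 for
+// even columns of A, set _b in ymm3..5 for odd columns); the blend routine sums
+// the two sets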
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_N_12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_n_12_lib4, @function
+inner_kernel_dgemv_add_n_12_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_n_12_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_12_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r11, %r14 // A1 <- A0
+	addq	%r12, %r14 // A1 <- A0 + 4*sda*sizeof(double)
+	movq	%r14, %r15 // A2 <- A1
+	addq	%r12, %r15 // A2 <- A1 + 4*sda*sizeof(double)
+
+	cmpl	$4, %r10d
+
+	prefetcht0	0(%r11) // software prefetch
+	prefetcht0	0(%r14) // software prefetch
+	prefetcht0	0(%r15) // software prefetch
+	prefetcht0	64(%r11) // software prefetch
+	prefetcht0	64(%r14) // software prefetch
+	prefetcht0	64(%r15) // software prefetch
+
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	128(%r11) // software prefetch
+	prefetcht0	128(%r14) // software prefetch
+	prefetcht0	128(%r15) // software prefetch
+
+	vbroadcastsd	0(%r13), %ymm12
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmovapd	0(%r14), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd	0(%r15), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	
+	subl	$4, %r10d
+
+	vbroadcastsd	8(%r13), %ymm12
+	vmovapd	32(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vmovapd	32(%r14), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	vmovapd	32(%r15), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	
+	prefetcht0	192(%r11) // software prefetch
+	prefetcht0	192(%r14) // software prefetch
+	prefetcht0	192(%r15) // software prefetch
+
+	vbroadcastsd	16(%r13), %ymm12
+	vmovapd	64(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmovapd	64(%r14), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd	64(%r15), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vbroadcastsd	24(%r13), %ymm12
+	addq	$32, %r13 // x+4
+	vmovapd	96(%r11), %ymm8
+	addq	$128, %r11 // A0+4*bs
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vmovapd	96(%r14), %ymm8
+	addq	$128, %r14 // A1+4*bs
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	vmovapd	96(%r15), %ymm8
+	addq	$128, %r15 // A2+4*bs
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vbroadcastsd	0(%r13), %ymm12
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmovapd	0(%r14), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd	0(%r15), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	
+	addq	$32, %r11
+	addq	$32, %r14
+	addq	$32, %r15
+	addq	$8, %r13
+	
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+
+	jg		0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_n_12_lib4, .-inner_kernel_dgemv_add_n_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm4  <- [z4a z4b z4c z4d]
+// ymm5  <- [z5a z5b z5c z5d]
+// ymm6  <- [z6a z6b z6c z6d]
+// ymm7  <- [z7a z7b z7c z7d]
+// ymm8  <- [z8a z8b z8c z8d]
+// ymm9  <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x+k*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm4  <- [z4a z4b z4c z4d]
+// ymm5  <- [z5a z5b z5c z5d]
+// ymm6  <- [z6a z6b z6c z6d]
+// ymm7  <- [z7a z7b z7c z7d]
+// ymm8  <- [z8a z8b z8c z8d]
+// ymm9  <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
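+//
+// note: each accumulator ymm0..ymm11 collects the partial dot product of one
+// output entry over successive groups of 4 entries of x; the lanes are summed
+// by the horizontal-add reduction in the blend_t routine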
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_T_12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_t_12_lib4, @function
+inner_kernel_dgemv_add_t_12_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_t_12_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_12_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+
+	prefetcht0	0(%r11) // software prefetch
+	prefetcht0	64(%r11) // software prefetch
+	prefetcht0	128(%r11) // software prefetch
+	prefetcht0	192(%r11) // software prefetch
+	prefetcht0	256(%r11) // software prefetch
+	prefetcht0	320(%r11) // software prefetch
+
+	jl		0f // clean-up loop
+
+	movq	%r11, %r14
+	addq	%r12, %r14 // A+bs*sda
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	0(%r14) // software prefetch
+
+	vmovupd	0(%r13), %ymm12
+	addq	$32, %r13 // x+4
+
+	vmovapd	0(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	subl	$4, %r10d
+
+	vmovapd	32(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	prefetcht0	64(%r14) // software prefetch
+
+	vmovapd	64(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmovapd	96(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+
+	prefetcht0	128(%r14) // software prefetch
+
+	vmovapd	128(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	
+	vmovapd	160(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	
+	prefetcht0	192(%r14) // software prefetch
+
+	vmovapd	192(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	vmovapd	224(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	prefetcht0	256(%r14) // software prefetch
+
+	vmovapd	256(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm8, %ymm15, %ymm8
+	
+	vmovapd	288(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm9, %ymm15, %ymm9
+	
+	prefetcht0	320(%r14) // software prefetch
+
+	vmovapd	320(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm10, %ymm15, %ymm10
+
+	vmovapd	352(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm11, %ymm15, %ymm11
+	
+//	addq	%r12, %r11 // A+bs*sda
+	movq	%r14, %r11 // A+bs*sda
+	addq	%r12, %r14 // A+bs*sda+bs*sda
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vcvtsi2sd	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vsubpd		%ymm14, %ymm13, %ymm14
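+	// lane i of ymm14 now holds (i+0.5)-k_left: the sign bit is set exactly for
+	// the k_left remaining entries of x, which the masked load below reads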
+
+	vmaskmovpd	0(%r13), %ymm14, %ymm12
+
+	vmovapd	0(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	vmovapd	32(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	vmovapd	64(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmovapd	96(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+		
+	vmovapd	128(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	
+	vmovapd	160(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	
+	vmovapd	192(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	vmovapd	224(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	vmovapd	256(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm8, %ymm15, %ymm8
+	
+	vmovapd	288(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm9, %ymm15, %ymm9
+	
+	vmovapd	320(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm10, %ymm15, %ymm10
+
+	vmovapd	352(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm11, %ymm15, %ymm11
+
+	sall	$3, %r10d
+//	movslq	%r10d, %r10
+	addq	%r10, %r11
+	addq	%r10, %r13
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_t_12_lib4, .-inner_kernel_dgemv_add_t_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z8 z9 za zb]_a
+// ymm3  <- [z0 z1 z2 z3]_b
+// ymm4  <- [z4 z5 z6 z7]_b
+// ymm5  <- [z8 z9 za zb]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2  <- [z8 z9 za zb]
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_SCALE_AB_12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_scale_ab_12_lib4, @function
+inner_blend_n_scale_ab_12_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_scale_ab_12_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_12_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm1, %ymm4, %ymm1
+	vaddpd	%ymm2, %ymm5, %ymm2
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm1, %ymm15, %ymm1
+	vmulpd	%ymm2, %ymm15, %ymm2
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd		0(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	vmovupd		32(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm1, %ymm14, %ymm1
+	vmovupd		64(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm2, %ymm14, %ymm2
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_scale_ab_12_lib4, .-inner_blend_n_scale_ab_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8  <- [z8a z8b z8c z8d]
+// ymm9  <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2  <- [z8 z9 za zb]
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_AB_12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_ab_12_lib4, @function
+inner_blend_t_scale_ab_12_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_ab_12_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_12_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm5, %ymm4, %ymm4
+	vhaddpd	%ymm9, %ymm8, %ymm8
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vhaddpd	%ymm7, %ymm6, %ymm6
+	vhaddpd	%ymm11, %ymm10, %ymm10
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm3
+	vperm2f128	$0x2, %ymm4, %ymm6, %ymm5
+	vperm2f128	$0x2, %ymm8, %ymm10, %ymm9
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vperm2f128	$0x13, %ymm4, %ymm6, %ymm4
+	vperm2f128	$0x13, %ymm8, %ymm10, %ymm8
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm4, %ymm5, %ymm1
+	vaddpd	%ymm8, %ymm9, %ymm2
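+	// ymm0/ymm1/ymm2 now hold the reduced results [z0 z1 z2 z3], [z4 z5 z6 z7], [z8 z9 za zb]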
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm1, %ymm15, %ymm1
+	vmulpd	%ymm2, %ymm15, %ymm2
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd		0(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	vmovupd		32(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm1, %ymm14, %ymm1
+	vmovupd		64(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm2, %ymm14, %ymm2
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_ab_12_lib4, .-inner_blend_t_scale_ab_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==n
+//
+// input arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z8 z9 za zb]_a
+// ymm3  <- [z0 z1 z2 z3]_b
+// ymm4  <- [z4 z5 z6 z7]_b
+// ymm5  <- [z8 z9 za zb]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0  <- [z0 z1 z2 z3]
+// ymm1  <- [z4 z5 z6 z7]
+// ymm2  <- [z8 z9 za zb]
+// ymm3  <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLENDER_N_12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blender_n_12_lib4, @function
+inner_blender_n_12_lib4:
+#elif defined(OS_MAC)
+_inner_blender_n_12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blender_n_12_lib4; .scl 2; .type 32; .endef
+inner_blender_n_12_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm1, %ymm4, %ymm1
+	vaddpd	%ymm2, %ymm5, %ymm2
+
+	cmpl	$0, %r10d // alg
+	je		0f // return
+
+	cmpl	$1, %r10d // alg
+	jne		1f // alg==-1
+
+	// alg==1
+	vmovupd		0(%r11), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovupd		64(%r11), %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+
+	jmp		0f // return
+
+1:
+
+	// alg==-1
+	vmovupd		0(%r11), %ymm15
+	vsubpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vsubpd		%ymm1, %ymm15, %ymm1
+	vmovupd		64(%r11), %ymm15
+	vsubpd		%ymm2, %ymm15, %ymm2
+
+0: // return
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blender_n_12_lib4, .-inner_blender_n_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==t
+//
+// input arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm4  <- [z4a z4b z4c z4d]
+// ymm5  <- [z5a z5b z5c z5d]
+// ymm6  <- [z6a z6b z6c z6d]
+// ymm7  <- [z7a z7b z7c z7d]
+// ymm8  <- [z8a z8b z8c z8d]
+// ymm9  <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0  <- [z0 z1 z2 z3]
+// ymm1  <- [z4 z5 z6 z7]
+// ymm2  <- [z8 z9 za zb]
+// ymm3  <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLENDER_T_12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blender_t_12_lib4, @function
+inner_blender_t_12_lib4:
+#elif defined(OS_MAC)
+_inner_blender_t_12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blender_t_12_lib4; .scl 2; .type 32; .endef
+inner_blender_t_12_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm5, %ymm4, %ymm4
+	vhaddpd	%ymm9, %ymm8, %ymm8
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vhaddpd	%ymm7, %ymm6, %ymm6
+	vhaddpd	%ymm11, %ymm10, %ymm10
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm3
+	vperm2f128	$0x2, %ymm4, %ymm6, %ymm5
+	vperm2f128	$0x2, %ymm8, %ymm10, %ymm9
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vperm2f128	$0x13, %ymm4, %ymm6, %ymm4
+	vperm2f128	$0x13, %ymm8, %ymm10, %ymm8
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm4, %ymm5, %ymm1
+	vaddpd	%ymm8, %ymm9, %ymm2
+
+	cmpl	$0, %r10d // alg
+	je		0f // return
+
+	cmpl	$1, %r10d // alg
+	jne		1f // alg==-1
+
+	// alg==1
+	vmovupd		0(%r11), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovupd		64(%r11), %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+
+	jmp		0f // return
+
+1:
+
+	// alg==-1
+	vmovupd		0(%r11), %ymm15
+	vsubpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vsubpd		%ymm1, %ymm15, %ymm1
+	vmovupd		64(%r11), %ymm15
+	vsubpd		%ymm2, %ymm15, %ymm2
+
+0: // return
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blender_t_12_lib4, .-inner_blender_t_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store 
+//
+// input arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+//
+// output arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_12_lib4, @function
+inner_store_12_lib4:
+#elif defined(OS_MAC)
+_inner_store_12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_12_lib4; .scl 2; .type 32; .endef
+inner_store_12_lib4:
+#endif
+#endif
+	
+	vmovupd %ymm0, 0(%r10)
+	vmovupd %ymm1, 32(%r10)
+	vmovupd %ymm2, 64(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_12_lib4, .-inner_store_12_lib4
+#endif
+#endif
+
+
+
+
+
+//                             rdi    rsi            rdx        rcx      r8         r9            rsp+8      rsp+16
+// void kernel_dgemv_n_12_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
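+//
+// computes z[0:12] = alpha * A * x + beta * y, where A is 12 x k stored in three
+// 4-row panels (lib4 format) with panel stride sda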
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_n_12_lib4
+	.type kernel_dgemv_n_12_lib4, @function
+kernel_dgemv_n_12_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_n_12_lib4
+_kernel_dgemv_n_12_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_n_12_lib4
+	.def kernel_dgemv_n_12_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_12_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+
+
+	// call inner dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_12_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_12_lib4
+#endif
+#endif
+
+
+	// call inner blender n
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_12_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_12_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_n_12_lib4, .-kernel_dgemv_n_12_lib4
+#endif
+
+
+
+
+
+//                             rdi    rsi            rdx        rcx      r8         r9            rsp+8      rsp+16
+// void kernel_dgemv_t_12_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
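+//
+// Reference semantics (informal C sketch, same bs=4 panel-major layout;
+// clean-up cases omitted): z[0:12] = alpha*A'*x + beta*y, with A of size k x 12
+//
+//   // for(j=0; j<12; j++) {
+//   //     double t = 0.0;
+//   //     for(i=0; i<k; i++) t += A[(i/4)*4*sda + i%4 + 4*j] * x[i];
+//   //     z[j] = alpha[0]*t + beta[0]*y[j];
+//   // }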
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_t_12_lib4
+	.type kernel_dgemv_t_12_lib4, @function
+kernel_dgemv_t_12_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_t_12_lib4
+_kernel_dgemv_t_12_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_t_12_lib4
+	.def kernel_dgemv_t_12_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_12_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_12_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_12_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_12_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_12_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_t_12_lib4, .-kernel_dgemv_t_12_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemv_4_lib4.S b/kernel/avx/kernel_dgemv_4_lib4.S
new file mode 100644
index 0000000..656e220
--- /dev/null
+++ b/kernel/avx/kernel_dgemv_4_lib4.S
@@ -0,0 +1,4503 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- x
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z0 z1 z2 z3]_b
+// ymm2  <- [z0 z1 z2 z3]_c
+// ymm3  <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- x+k*sizeof(double)
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z0 z1 z2 z3]_b
+// ymm2  <- [z0 z1 z2 z3]_c
+// ymm3  <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
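+//
+// Accumulation scheme (informal C sketch): column j of the 4-row panel is
+// scaled by the broadcast x[j] and added into accumulator ymm(j%4); the four
+// partial sums are reduced afterwards by the blend routine (the clean-up loop
+// folds leftover columns into ymm0 only, which gives the same total).
+//
+//   // double acc[4][4] = {{0.0}};                  // ymm0..ymm3
+//   // for(j=0; j<k; j++)
+//   //     for(i=0; i<4; i++)
+//   //         acc[j%4][i] += A[i+4*j] * x[j];
+//   // // z[i] = acc[0][i]+acc[1][i]+acc[2][i]+acc[3][i]   (blend step)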
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_n_4_lib4, @function
+inner_kernel_dgemv_add_n_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_n_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovapd	0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	subl	$4, %r10d
+
+	vmovapd	32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	vmovapd	64(%r11), %ymm8
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmovapd	96(%r11), %ymm8
+	vbroadcastsd	24(%r12), %ymm12
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	
+	addq	$128, %r11
+	addq	$32, %r12
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vmovapd	0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	addq	$32, %r11
+	addq	$8, %r12
+	
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+
+	jg		0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_n_4_lib4, .-inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x+k*sizeof(double)
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
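+//
+// Accumulation scheme (informal C sketch): each iteration consumes one 4-row
+// panel block; accumulator ymm(j) collects element-wise products for output j,
+// so a horizontal reduction (blend_t) is still required at the end.
+//
+//   // double acc[4][4] = {{0.0}};                  // ymm0..ymm3
+//   // for(i=0; i<k; i+=4)                          // masked clean-up omitted
+//   //     for(j=0; j<4; j++)
+//   //         for(ii=0; ii<4; ii++)
+//   //             acc[j][ii] += A[i*sda + 4*j + ii] * x[i+ii];
+//   // // z[j] = acc[j][0]+acc[j][1]+acc[j][2]+acc[j][3]   (blend_t step)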
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_t_4_lib4, @function
+inner_kernel_dgemv_add_t_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_t_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_4_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovupd	0(%r13), %ymm12
+
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	subl	$4, %r10d
+
+	vmovapd	32(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	vmovapd	64(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmovapd	96(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	
+	addq	%r12, %r11
+	addq	$32, %r13
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vcvtsi2sd	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vsubpd		%ymm14, %ymm13, %ymm14
+
+	vmaskmovpd	0(%r13), %ymm14, %ymm12
+
+	vmaskmovpd	0(%r11), %ymm14, %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	vmaskmovpd	32(%r11), %ymm14, %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	vmaskmovpd	64(%r11), %ymm14, %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmaskmovpd	96(%r11), %ymm14, %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+		
+	sall	$3, %r10d
+//	movslq	%r10d, %r10
+	addq	%r10, %r11
+	addq	%r10, %r13
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_t_4_lib4, .-inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t
+// r14   <- z_n
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t+k*sizeof(double)
+// r14   <- z_n+k*sizeof(double)
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
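+//
+// Fused update (informal C sketch): one sweep over a block of 4 panel rows
+// advances both the transposed product (accumulated in ymm0..ymm3) and the
+// non-transposed product (read-modify-written at z_n), with x_n[0..3] held
+// pre-broadcast in ymm6..ymm9.
+//
+//   // for(i=0; i<k; i+=4)                          // masked clean-up omitted
+//   //     for(j=0; j<4; j++)
+//   //         for(ii=0; ii<4; ii++) {
+//   //             double a = A[i*sda + 4*j + ii];
+//   //             acc_t[j][ii] += a * x_t[i+ii];   // ymm0..ymm3
+//   //             z_n[i+ii]    += a * x_n[j];
+//   //         }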
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_nt_4_lib4, @function
+inner_kernel_dgemv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_nt_4_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovupd	0(%r13), %ymm12
+	vmovupd	0(%r14), %ymm13
+
+	vmovapd	0(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm14, %ymm6, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	subl	$4, %r10d
+
+	vmovapd	32(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmulpd	%ymm14, %ymm7, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovapd	64(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm14, %ymm8, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+
+	vmovapd	96(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vmulpd	%ymm14, %ymm9, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovupd	%ymm13, 0(%r14) 
+
+	addq	%r12, %r11
+	addq	$32, %r13
+	addq	$32, %r14
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vcvtsi2sd	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vsubpd		%ymm14, %ymm13, %ymm11
+
+	vmaskmovpd	0(%r13), %ymm11, %ymm12
+	vmaskmovpd	0(%r14), %ymm11, %ymm13
+
+//	vmovupd	%ymm14, -32(%rsp) // spill mask to stack
+
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	0(%r11), %ymm11, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm14, %ymm6, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	32(%r11), %ymm11, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmulpd	%ymm14, %ymm7, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	64(%r11), %ymm11, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm14, %ymm8, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	96(%r11), %ymm11, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vmulpd	%ymm14, %ymm9, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+		
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	%ymm13, %ymm11, 0(%r14)
+
+	sall	$3, %r10d // *sizeof(double)
+	addq	%r10, %r11
+	addq	%r10, %r13
+	addq	%r10, %r14
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_nt_4_lib4, .-inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x
+// r14d  <- offA
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 
+// r11   <- 
+// r12   <- 
+// r13   <- 
+// r14d  <- offA
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
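+//
+// Edge step (informal C sketch): A and x are rewound by offA elements to the
+// start of the first panel and a two-sided sign mask keeps only the lanes
+// offA <= lane < min(4, offA+k), so the partial first panel is processed with
+// full-width loads from the panel start.
+//
+//   // for(ii=0; ii<k && ii<4-offA; ii++)           // rows handled by this edge
+//   //     for(j=0; j<4; j++)
+//   //         acc[j] += A[ii+4*j] * x[ii];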
+
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_DGEMV_ADD_T_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgemv_add_t_4_lib4, @function
+inner_edge_dgemv_add_t_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemv_add_t_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgemv_add_t_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemv_add_t_4_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r14d
+	jle		0f // return
+
+	movl	%r14d, %r15d
+	sall	$3, %r15d // offA*sizeof(double)
+
+	subq	%r15, %r11 // A - offA
+	subq	%r15, %r13 // x - offA
+
+	movl	%r10d, %r15d // kmax
+	addl	%r14d, %r15d // kmax + offA
+
+	vcvtsi2sd	%r14d, %xmm14, %xmm14 // offA
+	vcvtsi2sd	%r15d, %xmm15, %xmm15 // offA + kmax
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm13, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm13, %ymm15
+	vandpd		%ymm15, %ymm14, %ymm14
+
+	vmaskmovpd	0(%r13), %ymm14, %ymm12
+
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	vmovapd	32(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	vmovapd	64(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmovapd	96(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+
+	addq	$32, %r13 // x + 4
+	addq	%r12, %r11 // A + bs*sda
+		
+	addl	%r14d, %r10d
+	subl	$4, %r10d // kmax - (4-offA)
+	
+0: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgemv_add_t_4_lib4, .-inner_edge_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10   <- kmax
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t
+// r14   <- z_n
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- kmax-4
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t+k*sizeof(double)
+// r14   <- z_n+k*sizeof(double)
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
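+//
+// Diagonal-block step for dsymv (informal C sketch): only the lower triangle
+// of the 4x4 diagonal block is referenced; the vblendpd masks drop the upper
+// part for the "t" update and additionally drop the diagonal for the "n"
+// update, so each off-diagonal entry contributes twice and the diagonal once.
+//
+//   // for(j=0; j<4; j++)
+//   //     for(i=j; i<4; i++) {
+//   //         z_t[j] += A[i+4*j] * x_t[i];         // lower part incl. diagonal
+//   //         if(i>j) z_n[i] += A[i+4*j] * x_n[j]; // strictly lower part
+//   //     }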
+
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dsymv_add_nt_4_lib4, @function
+inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dsymv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_lib4:
+#endif
+#endif
+
+	vmovupd		0(%r13), %ymm12
+	vmovupd		0(%r14), %ymm13
+
+	vmovupd		0(%r11), %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm6, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmovupd		32(%r11), %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x3, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm7, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmovupd		64(%r11), %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x3, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x7, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm8, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+
+	vmovupd		96(%r11), %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x7, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+//	vxorpd		%ymm15, %ymm15, %ymm15
+//	vblendpd	$0x0, %ymm14, %ymm15, %ymm14
+//	vmulpd		%ymm14, %ymm9, %ymm15
+//	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmovupd		%ymm13, 0(%r14) 
+
+	addq	%r12, %r11
+	addq	$32, %r13
+	addq	$32, %r14
+	
+	subq	$4, %r10
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dsymv_add_nt_4_lib4, .-inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10   <- kmax
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t
+// r14   <- z_n
+// r15   <- offA
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- kmax-4
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t+k*sizeof(double)
+// r14   <- z_n+k*sizeof(double)
+// r15   <- offA
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_DSYMV_ADD_NT_4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dsymv_add_nt_4_gen_lib4, @function
+inner_edge_dsymv_add_nt_4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dsymv_add_nt_4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_gen_lib4:
+#endif
+#endif
+
+	movl	$4, %eax
+	cmpl	%eax, %r10d
+	jge		0f
+	movl	%r10d, %eax
+0:
+	subl	%r15d, %eax
+
+	vcvtsi2sd	%eax, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubpd		%ymm14, %ymm13, %ymm11
+
+	vmaskmovpd	0(%r13), %ymm11, %ymm12
+	vmaskmovpd	0(%r14), %ymm11, %ymm13
+
+	vmaskmovpd	0(%r11), %ymm11, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm6, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmaskmovpd	32(%r11), %ymm11, %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x3, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm7, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmaskmovpd	64(%r11), %ymm11, %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x3, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x7, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm8, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+
+	vmaskmovpd	96(%r11), %ymm11, %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x7, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+//	vxorpd		%ymm15, %ymm15, %ymm15
+//	vblendpd	$0x0, %ymm14, %ymm15, %ymm14
+//	vmulpd		%ymm14, %ymm9, %ymm15
+//	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmaskmovpd	%ymm13, %ymm11, 0(%r14)
+
+	subl	%eax, %r10d
+
+	salq	$3, %rax // *sizeof(double)
+	addq	%rax, %r11
+	subq	$32, %r11
+	addq	%r12, %r11
+	addq	%rax, %r13
+	addq	%rax, %r14
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dsymv_add_nt_4_gen_lib4, .-inner_edge_dsymv_add_nt_4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n
+//
+// input arguments:
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_lib4, @function
+inner_blend_n_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_lib4; .scl 2; .type 32; .endef
+inner_blend_n_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm1, %ymm0
+	vaddpd	%ymm2, %ymm3, %ymm2
+	vaddpd	%ymm0, %ymm2, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_lib4, .-inner_blend_n_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t
+//
+// input arguments:
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
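+//
+// Reduction trick (informal sketch): vhaddpd sums adjacent lane pairs of two
+// accumulators and the two vperm2f128 regroup the 128-bit halves, so the final
+// vaddpd delivers z[j] = ymm_j[0]+ymm_j[1]+ymm_j[2]+ymm_j[3] for j=0..3
+// without any scalar extraction.
+//
+//   // for(j=0; j<4; j++)
+//   //     z[j] = acc[j][0] + acc[j][1] + acc[j][2] + acc[j][3];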
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_lib4, @function
+inner_blend_t_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_lib4; .scl 2; .type 32; .endef
+inner_blend_t_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm1
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vaddpd	%ymm0, %ymm1, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_lib4, .-inner_blend_t_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
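+//
+// Scaling applied here (informal sketch), for i=0..3:
+//
+//   // z[i] = alpha[0]*(acc0[i]+acc1[i]+acc2[i]+acc3[i]) + beta[0]*y[i];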
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_scale_ab_4_lib4, @function
+inner_blend_n_scale_ab_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_scale_ab_4_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_4_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm1, %ymm0
+	vaddpd	%ymm2, %ymm3, %ymm2
+	vaddpd	%ymm0, %ymm2, %ymm0
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd		0(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_scale_ab_4_lib4, .-inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_SCALE_M11_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_scale_m11_4_lib4, @function
+inner_blend_n_scale_m11_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_m11_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_scale_m11_4_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_m11_4_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm1, %ymm0
+	vaddpd	%ymm2, %ymm3, %ymm2
+	vaddpd	%ymm0, %ymm2, %ymm0
+
+	// beta
+	vmovupd		0(%r10), %ymm14
+	vsubpd		%ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_scale_m11_4_lib4, .-inner_blend_n_scale_m11_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_ab_4_lib4, @function
+inner_blend_t_scale_ab_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_ab_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_4_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm1
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vaddpd	%ymm0, %ymm1, %ymm0
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd		0(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_ab_4_lib4, .-inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_a1_4_lib4, @function
+inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_a1_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm1
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vaddpd	%ymm0, %ymm1, %ymm0
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+
+	// beta
+	vmovupd		0(%r11), %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_a1_4_lib4, .-inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_m11_4_lib4, @function
+inner_blend_t_scale_m11_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_m11_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_m11_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_m11_4_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm1
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vaddpd	%ymm0, %ymm1, %ymm0
+
+	vmovupd		0(%r10), %ymm14
+	vsubpd		%ymm0, %ymm14, %ymm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_m11_4_lib4, .-inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
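+//
+// Forward substitution performed here (informal C sketch), with E a 4x4
+// panel-major lower-triangular block and inv_diag_E[i] = 1.0/E(i,i):
+//
+//   // for(i=0; i<4; i++) {
+//   //     double t = z[i];
+//   //     for(j=0; j<i; j++) t -= E[i+4*j] * z[j];
+//   //     z[i] = t * inv_diag_E[i];
+//   // }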
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSV_LN_INV_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsv_ln_inv_4_lib4, @function
+inner_edge_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsv_ln_inv_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_ln_inv_4_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm1
+	vblendpd		$0x1, %ymm1, %ymm0, %ymm0
+
+	vmovapd			0(%r10), %ymm13
+	vblendpd		$0x1, %ymm14, %ymm13, %ymm13
+	vpermilpd		$0x0, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm1
+	vblendpd		$0x2, %ymm1, %ymm0, %ymm0
+
+	vmovapd			32(%r10), %ymm13
+	vblendpd		$0x3, %ymm14, %ymm13, %ymm13
+	vpermilpd		$0x3, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm1
+	vblendpd		$0x4, %ymm1, %ymm0, %ymm0
+
+	vmovapd			64(%r10), %ymm13
+	vblendpd		$0x7, %ymm14, %ymm13, %ymm13
+	vpermilpd		$0x0, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm1
+	vblendpd		$0x8, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsv_ln_inv_4_lib4, .-inner_edge_dtrsv_ln_inv_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS, variable size version
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSV_LN_INV_4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsv_ln_inv_4_vs_lib4, @function
+inner_edge_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsv_ln_inv_4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_ln_inv_4_vs_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm1
+	vblendpd		$0x1, %ymm1, %ymm0, %ymm0
+	vmovapd			0(%r10), %ymm13
+	vblendpd		$0x1, %ymm14, %ymm13, %ymm13
+	vpermilpd		$0x0, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	cmpl			$2, %r12d
+	jl				0f // ret
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm1
+	vblendpd		$0x2, %ymm1, %ymm0, %ymm0
+	vmovapd			32(%r10), %ymm13
+	vblendpd		$0x3, %ymm14, %ymm13, %ymm13
+	vpermilpd		$0x3, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	cmpl			$3, %r12d
+	jl				0f // ret
+
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm1
+	vblendpd		$0x4, %ymm1, %ymm0, %ymm0
+	vmovapd			64(%r10), %ymm13
+	vblendpd		$0x7, %ymm14, %ymm13, %ymm13
+	vpermilpd		$0x0, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	cmpl			$4, %r12d
+	jl				0f // ret
+
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm1
+	vblendpd		$0x8, %ymm1, %ymm0, %ymm0
+
+	// return
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsv_ln_inv_4_vs_lib4, .-inner_edge_dtrsv_ln_inv_4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
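+//
+// Backward substitution performed here (informal C sketch): solves E'*z = b
+// with E a 4x4 panel-major lower-triangular block (so E' is upper triangular),
+// using inv_diag_E[i] = 1.0/E(i,i):
+//
+//   // for(i=3; i>=0; i--) {
+//   //     double t = z[i];
+//   //     for(j=i+1; j<4; j++) t -= E[j+4*i] * z[j];
+//   //     z[i] = t * inv_diag_E[i];
+//   // }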
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSV_LT_INV_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsv_lt_inv_4_lib4, @function
+inner_edge_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsv_lt_inv_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_4_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vmovapd			16(%r10), %xmm12
+	vmovapd			48(%r10), %xmm13
+	vunpcklpd		%xmm13, %xmm12, %xmm9
+	vblendpd		$0xc, %ymm14, %ymm9, %ymm9
+	vunpckhpd		%xmm13, %xmm12, %xmm10
+	vmovsd			8(%r10), %xmm8
+	vblendpd		$0xe, %ymm14, %ymm8, %ymm8
+	vmovsd			88(%r10), %xmm11
+	vinsertf128		$0x1, %xmm11, %ymm10, %ymm10
+	vblendpd		$0x8, %ymm14, %ymm10, %ymm10
+
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x8, %ymm1, %ymm0, %ymm0
+
+	vpermilpd		$0xf, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x4, %ymm1, %ymm0, %ymm0
+
+	vpermilpd		$0x0, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x2, %ymm1, %ymm0, %ymm0
+
+	vpermilpd		$0x3, %ymm0, %ymm12
+//	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+//	vbroadcastsd	8(%r11), %ymm12
+	vmovsd			0(%r11), %xmm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsv_lt_inv_4_lib4, .-inner_edge_dtrsv_lt_inv_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- k
+// r13  <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- k
+// r13  <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSV_LT_INV_3_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsv_lt_inv_3_lib4, @function
+inner_edge_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsv_lt_inv_3_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_3_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vmovapd			16(%r10), %xmm12
+	vmovapd			48(%r10), %xmm13
+	vunpcklpd		%xmm13, %xmm12, %xmm9
+	vblendpd		$0xc, %ymm14, %ymm9, %ymm9
+	vunpckhpd		%xmm13, %xmm12, %xmm10
+	vmovsd			8(%r10), %xmm8
+	vblendpd		$0xe, %ymm14, %ymm8, %ymm8
+	vmovsd			88(%r10), %xmm11
+	vinsertf128		$0x1, %xmm11, %ymm10, %ymm10
+	vblendpd		$0x8, %ymm14, %ymm10, %ymm10
+
+//	vbroadcastsd	24(%r11), %ymm12
+//	vmulpd			%ymm12, %ymm0, %ymm1
+//	vblendpd		$0x8, %ymm1, %ymm0, %ymm0
+
+	vmovupd			0(%r13), %ymm12
+	vblendpd		$0x8, %ymm12, %ymm0, %ymm0
+	
+	cmpl			$4, %r12d
+	jl				0f
+
+	vpermilpd		$0xf, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+0:
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x4, %ymm1, %ymm0, %ymm0
+
+	vpermilpd		$0x0, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x2, %ymm1, %ymm0, %ymm0
+
+	vpermilpd		$0x3, %ymm0, %ymm12
+//	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+//	vbroadcastsd	8(%r11), %ymm12
+	vmovsd			0(%r11), %xmm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsv_lt_inv_3_lib4, .-inner_edge_dtrsv_lt_inv_3_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- k
+// r13  <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- k
+// r13  <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSV_LT_INV_2_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsv_lt_inv_2_lib4, @function
+inner_edge_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsv_lt_inv_2_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_2_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	cmpl			$3, %r12d
+
+	vmovapd			16(%r10), %xmm12
+	vmovapd			48(%r10), %xmm13
+	vunpcklpd		%xmm13, %xmm12, %xmm9
+	vblendpd		$0xc, %ymm14, %ymm9, %ymm9
+	vunpckhpd		%xmm13, %xmm12, %xmm10
+	vmovsd			8(%r10), %xmm8
+	vblendpd		$0xe, %ymm14, %ymm8, %ymm8
+//	vmovsd			88(%r10), %xmm11
+//	vinsertf128		$0x1, %xmm11, %ymm10, %ymm10
+//	vblendpd		$0x8, %ymm14, %ymm10, %ymm10
+	vblendpd		$0xc, %ymm14, %ymm10, %ymm10
+
+//	vbroadcastsd	24(%r11), %ymm12
+//	vmulpd			%ymm12, %ymm0, %ymm1
+//	vblendpd		$0x8, %ymm1, %ymm0, %ymm0
+
+	vmovupd			0(%r13), %ymm12
+	vblendpd		$0xc, %ymm12, %ymm0, %ymm0
+	
+	je				0f
+	jl				1f
+
+	vpermilpd		$0xf, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+0:
+
+//	vbroadcastsd	16(%r11), %ymm12
+//	vmulpd			%ymm12, %ymm0, %ymm1
+//	vblendpd		$0x4, %ymm1, %ymm0, %ymm0
+
+	vpermilpd		$0x0, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+1:
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x2, %ymm1, %ymm0, %ymm0
+
+	vpermilpd		$0x3, %ymm0, %ymm12
+//	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+//	vbroadcastsd	8(%r11), %ymm12
+
+	vmovsd			0(%r11), %xmm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsv_lt_inv_2_lib4, .-inner_edge_dtrsv_lt_inv_2_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope 
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- k
+// r13  <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- k
+// r13  <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSV_LT_INV_1_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsv_lt_inv_1_lib4, @function
+inner_edge_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsv_lt_inv_1_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_1_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vmovupd			0(%r13), %ymm12
+	vblendpd		$0xe, %ymm12, %ymm0, %ymm0
+	
+	cmpl			$3, %r12d
+	je				0f
+
+	cmpl			$2, %r12d
+	je				1f
+	jl				2f
+
+	vmovsd			24(%r10), %xmm10
+	vblendpd		$0xe, %ymm14, %ymm10, %ymm10
+	vpermilpd		$0xf, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+0:
+
+	vmovsd			16(%r10), %xmm9
+	vblendpd		$0xe, %ymm14, %ymm9, %ymm9
+	vpermilpd		$0x0, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+1:
+
+	vmovsd			8(%r10), %xmm8
+	vblendpd		$0xe, %ymm14, %ymm8, %ymm8
+	vpermilpd		$0x3, %ymm0, %ymm12
+//	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+//	vbroadcastsd	8(%r11), %ymm12
+
+2:
+
+	vmovsd			0(%r11), %xmm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsv_lt_inv_1_lib4, .-inner_edge_dtrsv_lt_inv_1_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- x
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z0 z1 z2 z3]_b
+// ymm2  <- [z0 z1 z2 z3]_c
+// ymm3  <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- k-4
+// r11   <- A+4*4*sizeof(double)
+// r12   <- x+4*sizeof(double)
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z0 z1 z2 z3]_b
+// ymm2  <- [z0 z1 z2 z3]_c
+// ymm3  <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
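+//
+// Edge step (informal C sketch): processes the 4x4 upper-triangular corner of
+// A, i.e. only entries with i<=j contribute; the vblendpd masks zero the lower
+// part of each loaded column.
+//
+//   // for(j=0; j<4; j++)
+//   //     for(i=0; i<=j; i++)
+//   //         acc[j][i] += A[i+4*j] * x[j];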
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMV_UN_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmv_un_4_lib4, @function
+inner_edge_dtrmv_un_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmv_un_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmv_un_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmv_un_4_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vmovapd			0(%r11), %ymm8
+	vblendpd		$0x1, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	
+	subl			$4, %r10d
+
+	vmovapd			32(%r11), %ymm8
+	vblendpd		$0x3, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	
+	vmovapd			64(%r11), %ymm8
+	vblendpd		$0x7, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+
+	vmovapd			96(%r11), %ymm8
+	vbroadcastsd	24(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	
+	addq			$128, %r11
+	addq			$32, %r12
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmv_un_4_lib4, .-inner_edge_dtrmv_un_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z0 z1 z2 z3]_b
+// ymm2  <- [z0 z1 z2 z3]_c
+// ymm3  <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x+k*sizeof(double)
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z0 z1 z2 z3]_b
+// ymm2  <- [z0 z1 z2 z3]_c
+// ymm3  <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_DTRMV_UT_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dtrmv_ut_4_lib4, @function
+inner_kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dtrmv_ut_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dtrmv_ut_4_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+	jle		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovupd	0(%r13), %ymm12
+
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	subl	$4, %r10d
+
+	vmovapd	32(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	vmovapd	64(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmovapd	96(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	
+	addq	%r12, %r11
+	addq	$32, %r13
+	
+	cmpl	$4, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+//	vcvtsi2sd	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+//	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+//	vmovupd		LC02(%rip), %ymm13
+#endif
+//	vmovddup	%xmm14, %xmm14
+//	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+//	vsubpd		%ymm14, %ymm13, %ymm14
+//
+//	vmaskmovpd	0(%r13), %ymm14, %ymm12
+
+	vmovupd		0(%r13), %ymm12
+
+	vxorpd		%ymm14, %ymm14, %ymm14
+
+	vmovapd		0(%r11), %ymm8
+	vblendpd	$0x1, %ymm8, %ymm14, %ymm8
+	vmulpd		%ymm8, %ymm12, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	
+	vmovapd	32(%r11), %ymm8
+	vblendpd	$0x3, %ymm8, %ymm14, %ymm8
+	vmulpd		%ymm8, %ymm12, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	
+	vmovapd		64(%r11), %ymm8
+	vblendpd	$0x7, %ymm8, %ymm14, %ymm8
+	vmulpd		%ymm8, %ymm12, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+
+	vmovapd		96(%r11), %ymm8
+	vmulpd		%ymm8, %ymm12, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+		
+	sall		$3, %r10d
+//	movslq		%r10d, %r10
+	addq		%r10, %r11
+	addq		%r10, %r13
+	xorl		%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dtrmv_ut_4_lib4, .-inner_kernel_dtrmv_ut_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store 
+//
+// input arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4_lib4, @function
+inner_store_4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4_lib4; .scl 2; .type 32; .endef
+inner_store_4_lib4:
+#endif
+#endif
+	
+	vmovupd %ymm0,  0(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4_lib4, .-inner_store_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store vs
+//
+// input arguments:
+// r10   <- D
+// r11d   <- km
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d   <- km
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
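+//
+// Masked-store trick (informal sketch): lane i of .LC02 holds i+0.5, so
+// (i+0.5) - km is negative (sign bit set) exactly for the lanes i < km, and
+// vmaskmovpd writes only the lanes whose mask sign bit is set.
+//
+//   // for(i=0; i<4; i++) if(i<km) D[i] = z[i];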
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4_vs_lib4, @function
+inner_store_4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	vmaskmovpd	%ymm0, %ymm15,  0(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4_vs_lib4, .-inner_store_4_vs_lib4
+#endif
+#endif
+
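+// note on the masked store above: km is converted to double, broadcast, and
+// subtracted from .LC02, which holds the doubles 0.5, 1.5, 2.5, 3.5, so lane i
+// ends up with (i + 0.5) - km.  That value is negative (sign bit set) exactly
+// when i < km, and vmaskmovpd writes only the lanes whose sign bit is set.
+// Hedged scalar equivalent (i is an illustrative loop variable):
+//
+//	for(i=0; i<4; i++)
+//		if(i < km)
+//			D[i] = z[i];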
+
+
+
+
+// common inner routine with file scope
+//
+// store gen
+//
+// input arguments:
+// r10   <- D
+// r11d  <- k0 : start from (inclusive)
+// r12d  <- k1 : up to (exclusive)
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d  <- k0 : start from (inclusive)
+// r12d  <- k1 : up to (exclusive)
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4_gen_lib4, @function
+inner_store_4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_4_gen_lib4:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2sd	%r11d, %xmm14, %xmm14
+	vcvtsi2sd	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm12
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm12, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm12, %ymm15
+	vandpd		%ymm14, %ymm15, %ymm15
+
+	vmaskmovpd	%ymm0, %ymm15,  0(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4_gen_lib4, .-inner_store_4_gen_lib4
+#endif
+#endif
+
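+// note on the generalized masked store above: both bounds are compared against
+// the 0.5, 1.5, 2.5, 3.5 constants of .LC02; lane i ends up holding
+// k0 - (i + 0.5) in ymm14 and (i + 0.5) - k1 in ymm15, and the vandpd of the
+// two sign bits selects exactly the rows with k0 <= i < k1.  Hedged scalar
+// equivalent (i is an illustrative loop variable):
+//
+//	for(i=0; i<4; i++)
+//		if(i>=k0 && i<k1)
+//			D[i] = z[i];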
+
+
+
+
+//                            1      2              3          4          5             6          7
+// void kernel_dgemv_n_4_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_n_4_lib4
+	.type kernel_dgemv_n_4_lib4, @function
+kernel_dgemv_n_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_n_4_lib4
+_kernel_dgemv_n_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_n_4_lib4
+	.def kernel_dgemv_n_4_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11   // beta
+	movq	ARG6, %r12   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_n_4_lib4, .-kernel_dgemv_n_4_lib4
+#endif
+
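+// reference semantics of kernel_dgemv_n_4_lib4 (hedged scalar sketch, assuming
+// bs=4 panel-major storage of the 4 x k block A, i.e. element (i,j) at A[i+4*j];
+// tmp, i, j are illustrative locals):
+//
+//	for(i=0; i<4; i++)
+//		{
+//		tmp = 0.0;
+//		for(j=0; j<k; j++)
+//			tmp += A[i+4*j] * x[j];
+//		z[i] = beta[0]*y[i] + alpha[0]*tmp;
+//		}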
+
+
+
+
+//                               1      2              3          4          5             6          7          8
+// void kernel_dgemv_n_4_vs_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_n_4_vs_lib4
+	.type kernel_dgemv_n_4_vs_lib4, @function
+kernel_dgemv_n_4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_n_4_vs_lib4
+_kernel_dgemv_n_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_n_4_vs_lib4
+	.def kernel_dgemv_n_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11   // beta
+	movq	ARG6, %r12   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	ARG8, %r11 // k1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_n_4_vs_lib4, .-kernel_dgemv_n_4_vs_lib4
+#endif
+
+
+
+
+
+//                                1      2              3          4          5             6          7          8       9
+// void kernel_dgemv_n_4_gen_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k0, int k1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_n_4_gen_lib4
+	.type kernel_dgemv_n_4_gen_lib4, @function
+kernel_dgemv_n_4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_n_4_gen_lib4
+_kernel_dgemv_n_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_n_4_gen_lib4
+	.def kernel_dgemv_n_4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11   // beta
+	movq	ARG6, %r12   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	ARG8, %r11 // k0 
+	movq	ARG9, %r12 // k1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_n_4_gen_lib4, .-kernel_dgemv_n_4_gen_lib4
+#endif
+
+
+
+
+
+//                            1      2              3          4        5          6             7         8
+// void kernel_dgemv_t_4_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_t_4_lib4
+	.type kernel_dgemv_t_4_lib4, @function
+kernel_dgemv_t_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_t_4_lib4
+_kernel_dgemv_t_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_t_4_lib4
+	.def kernel_dgemv_t_4_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_t_4_lib4, .-kernel_dgemv_t_4_lib4
+#endif
+
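+// reference semantics of kernel_dgemv_t_4_lib4 (hedged scalar sketch; A is a
+// k x 4 column slab in bs=4 panel-major storage with panel stride sda, so
+// element (i,j) sits at A[i%4 + 4*j + (i/4)*4*sda]; tmp, i, j are illustrative
+// locals):
+//
+//	for(j=0; j<4; j++)
+//		{
+//		tmp = 0.0;
+//		for(i=0; i<k; i++)
+//			tmp += A[i%4 + 4*j + (i/4)*4*sda] * x[i];
+//		z[j] = beta[0]*y[j] + alpha[0]*tmp;
+//		}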
+
+
+
+
+//                               1      2              3          4        5          6             7         8           9
+// void kernel_dgemv_t_4_vs_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_t_4_vs_lib4
+	.type kernel_dgemv_t_4_vs_lib4, @function
+kernel_dgemv_t_4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_t_4_vs_lib4
+_kernel_dgemv_t_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_t_4_vs_lib4
+	.def kernel_dgemv_t_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+	movq	ARG9, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_t_4_vs_lib4, .-kernel_dgemv_t_4_vs_lib4
+#endif
+
+
+
+
+
+//                                1      2              3         4          5        6          7             8          9          10
+// void kernel_dgemv_t_4_gen_lib4(int k, double *alpha, int offA, double *A, int sda, double *x, double *beta, double *y, double *z, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_t_4_gen_lib4
+	.type kernel_dgemv_t_4_gen_lib4, @function
+kernel_dgemv_t_4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_t_4_gen_lib4
+_kernel_dgemv_t_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_t_4_gen_lib4
+	.def kernel_dgemv_t_4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv edge & kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG6, %r13  // x
+	movq	ARG3, %r14 // offA
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemv_add_t_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11   // beta
+	movq	ARG8, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG9, %r10 // z 
+	movq	ARG10, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_t_4_gen_lib4, .-kernel_dgemv_t_4_gen_lib4
+#endif
+
+
+
+
+
+//                                 1      2          3                   4          5          6
+// void kernel_dtrsv_ln_inv_4_lib4(int k, double *A, double *inv_diag_A, double *x, double *y, double *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsv_ln_inv_4_lib4
+	.type kernel_dtrsv_ln_inv_4_lib4, @function
+kernel_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsv_ln_inv_4_lib4
+_kernel_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsv_ln_inv_4_lib4
+	.def kernel_dtrsv_ln_inv_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_ln_inv_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+	movq	%r11, %r13 // A+k*sizeof(double)
+
+
+	// call inner blender n
+
+	movq	ARG5, %r10   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_m11_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_m11_4_lib4
+#endif
+#endif
+
+
+	// solution
+
+	movq	%r13, %r10 // A+k*sizeof(double)
+	movq	ARG3, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSV_LN_INV_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsv_ln_inv_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsv_ln_inv_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsv_ln_inv_4_lib4, .-kernel_dtrsv_ln_inv_4_lib4
+#endif
+
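+// reference semantics of kernel_dtrsv_ln_inv_4_lib4 (hedged sketch; the first
+// k columns of the panel hold the already-factored part, the 4x4 block at
+// A+4*k is the lower-triangular diagonal block, and inv_diag_A holds the
+// reciprocals of its diagonal; t and L below are illustrative names):
+//
+//	// t = y - A(:,0:k)*x
+//	for(i=0; i<4; i++)
+//		{
+//		t[i] = y[i];
+//		for(j=0; j<k; j++)
+//			t[i] -= A[i+4*j] * x[j];
+//		}
+//	// forward substitution with L = A+4*k, using the precomputed reciprocals
+//	for(i=0; i<4; i++)
+//		{
+//		for(j=0; j<i; j++)
+//			t[i] -= L[i+4*j] * z[j];
+//		z[i] = t[i] * inv_diag_A[i];
+//		}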
+
+
+
+
+//                                    1      2          3                   4          5          6          7       8
+// void kernel_dtrsv_ln_inv_4_vs_lib4(int k, double *A, double *inv_diag_A, double *x, double *y, double *z, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsv_ln_inv_4_vs_lib4
+	.type kernel_dtrsv_ln_inv_4_vs_lib4, @function
+kernel_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsv_ln_inv_4_vs_lib4
+_kernel_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsv_ln_inv_4_vs_lib4
+	.def kernel_dtrsv_ln_inv_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_ln_inv_4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+	movq	%r11, %r13 // A+k*sizeof(double)
+
+
+	// call inner blender n
+
+	movq	ARG5, %r10   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_m11_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_m11_4_lib4
+#endif
+#endif
+
+
+	// solution
+
+	movq	%r13, %r10 // A+k*sizeof(double)
+	movq	ARG3, %r11 // inv_diag_A
+	movq	ARG8, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSV_LN_INV_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsv_ln_inv_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsv_ln_inv_4_vs_lib4
+#endif
+#endif
+
+
+	// store vs
+
+	movq	ARG6, %r10 // z 
+	movq	ARG7, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsv_ln_inv_4_vs_lib4, .-kernel_dtrsv_ln_inv_4_vs_lib4
+#endif
+
+
+
+
+
+//                                 1      2          3        4                   5          6          7
+// void kernel_dtrsv_lt_inv_4_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsv_lt_inv_4_lib4
+	.type kernel_dtrsv_lt_inv_4_lib4, @function
+kernel_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsv_lt_inv_4_lib4
+_kernel_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsv_lt_inv_4_lib4
+	.def kernel_dtrsv_lt_inv_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	addq	%r12, %r11 // A+4*sda*sizeof(double)
+	movq	ARG5, %r13 // x
+	addq	$32, %r13 // x+4 
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG6, %r10 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+	// solution
+
+	movq	ARG2, %r10 // A
+	movq	ARG4, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSV_LT_INV_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsv_lt_inv_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsv_lt_inv_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsv_lt_inv_4_lib4, .-kernel_dtrsv_lt_inv_4_lib4
+#endif
+
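+// reference semantics of kernel_dtrsv_lt_inv_4_lib4 (hedged sketch): the
+// dgemv_t inner kernel, pointed at A+4*sda*sizeof(double) and x+4, forms
+// A(4:k,0:4)'*x(4:k); the blend then computes y minus that product, and the
+// edge routine back-substitutes through the transpose of the 4x4
+// lower-triangular block at A, scaling by the reciprocals in inv_diag_A, so
+// roughly z = (L')^{-1} * (y - A(4:k,0:4)'*x(4:k)).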
+
+
+
+
+//                                 rdi    rsi        rdx      rcx                 r8         r9         rsp+8   
+// void kernel_dtrsv_lt_inv_3_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsv_lt_inv_3_lib4
+	.type kernel_dtrsv_lt_inv_3_lib4, @function
+kernel_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsv_lt_inv_3_lib4
+_kernel_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsv_lt_inv_3_lib4
+	.def kernel_dtrsv_lt_inv_3_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_3_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	addq	%r12, %r11 // A+4*sda*sizeof(double)
+	movq	ARG5, %r13 // x
+	addq	$32, %r13 // x+4 
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG6, %r10 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+	// solution
+
+	movq	ARG2, %r10 // A
+	movq	ARG4, %r11 // inv_diag_A
+	movq	ARG1, %r12 // k
+	movq	ARG5, %r13 // x
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSV_LT_INV_3_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsv_lt_inv_3_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsv_lt_inv_3_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	$3, %r11
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsv_lt_inv_3_lib4, .-kernel_dtrsv_lt_inv_3_lib4
+#endif
+
+
+
+
+
+//                                 rdi    rsi        rdx      rcx                 r8         r9         rsp+8 
+// void kernel_dtrsv_lt_inv_2_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsv_lt_inv_2_lib4
+	.type kernel_dtrsv_lt_inv_2_lib4, @function
+kernel_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsv_lt_inv_2_lib4
+_kernel_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsv_lt_inv_2_lib4
+	.def kernel_dtrsv_lt_inv_2_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_2_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movslq	%r12d, %r12
+	addq	%r12, %r11 // A+4*sda*sizeof(double)
+	movq	ARG5, %r13 // x
+	addq	$32, %r13 // x+4 
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG6, %r10 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+	// solution
+
+	movq	ARG2, %r10 // A
+	movq	ARG4, %r11 // inv_diag_A
+	movq	ARG1, %r12 // k
+	movq	ARG5, %r13 // x
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSV_LT_INV_2_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsv_lt_inv_2_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsv_lt_inv_2_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	$2, %r11
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsv_lt_inv_2_lib4, .-kernel_dtrsv_lt_inv_2_lib4
+#endif
+
+
+
+
+
+//                                 rdi    rsi        rdx      rcx                 r8         r9         rsp+8 
+// void kernel_dtrsv_lt_inv_1_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsv_lt_inv_1_lib4
+	.type kernel_dtrsv_lt_inv_1_lib4, @function
+kernel_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsv_lt_inv_1_lib4
+_kernel_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsv_lt_inv_1_lib4
+	.def kernel_dtrsv_lt_inv_1_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_1_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movslq	%r12d, %r12
+	addq	%r12, %r11 // A+4*sda*sizeof(double)
+	movq	ARG5, %r13 // x
+	addq	$32, %r13 // x+4 
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG6, %r10 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+	// solution
+
+	movq	ARG2, %r10 // A
+	movq	ARG4, %r11 // inv_diag_A
+	movq	ARG1, %r12 // k
+	movq	ARG5, %r13 // x
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSV_LT_INV_1_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsv_lt_inv_1_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsv_lt_inv_1_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	$1, %r11
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsv_lt_inv_1_lib4, .-kernel_dtrsv_lt_inv_1_lib4
+#endif
+
+
+
+
+
+//                            rdi    rsi        rdx        rcx
+// void kernel_dtrmv_un_4_lib4(int k, double *A, double *x, double *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmv_un_4_lib4
+	.type kernel_dtrmv_un_4_lib4, @function
+kernel_dtrmv_un_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmv_un_4_lib4
+_kernel_dtrmv_un_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmv_un_4_lib4
+	.def kernel_dtrmv_un_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_un_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dtrmv edge & dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // x
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMV_UN_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmv_un_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmv_un_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+	// call inner blend n
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG4, %r10 // z
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmv_un_4_lib4, .-kernel_dtrmv_un_4_lib4
+#endif
+
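+// reference semantics of kernel_dtrmv_un_4_lib4 (hedged scalar sketch): z =
+// U*x, where the 4 x k row slab U has an upper-triangular leading 4x4 block
+// (handled by the edge routine via column blends) followed by k-4 dense
+// columns; i, j are illustrative locals:
+//
+//	for(i=0; i<4; i++)
+//		{
+//		z[i] = 0.0;
+//		for(j=i; j<k; j++)
+//			z[i] += A[i+4*j] * x[j];
+//		}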
+
+
+
+
+//                             rdi    rsi        rdx      rcx        r8
+// void kernel_dtrmv_ut_4_lib4(int k, double *A, int sda, double *x, double *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmv_ut_4_lib4
+	.type kernel_dtrmv_ut_4_lib4, @function
+kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmv_ut_4_lib4
+_kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmv_ut_4_lib4
+	.def kernel_dtrmv_ut_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_ut_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dtrmv kernel ut
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movslq	%r12d, %r12
+	movq	ARG4, %r13  // x
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_DTRMV_UT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dtrmv_ut_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dtrmv_ut_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmv_ut_4_lib4, .-kernel_dtrmv_ut_4_lib4
+#endif
+
+
+
+
+
+//                                rdi    rsi        rdx      rcx        r8         r9
+// void kernel_dtrmv_ut_4_vs_lib4(int k, double *A, int sda, double *x, double *y, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmv_ut_4_vs_lib4
+	.type kernel_dtrmv_ut_4_vs_lib4, @function
+kernel_dtrmv_ut_4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmv_ut_4_vs_lib4
+_kernel_dtrmv_ut_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmv_ut_4_vs_lib4
+	.def kernel_dtrmv_ut_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_ut_4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dtrmv kernel ut
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movslq	%r12d, %r12
+	movq	ARG4, %r13  // x
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_DTRMV_UT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dtrmv_ut_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dtrmv_ut_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // z 
+	movq	ARG6, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmv_ut_4_vs_lib4, .-kernel_dtrmv_ut_4_vs_lib4
+#endif
+
+
+
+
+
+//                             1      2                3                4          5        6            7            8               9            10           11
+// void kernel_dgemv_nt_4_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_nt_4_lib4
+	.type kernel_dgemv_nt_4_lib4, @function
+kernel_dgemv_nt_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_nt_4_lib4
+_kernel_dgemv_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_nt_4_lib4
+	.def kernel_dgemv_nt_4_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha_n
+	vbroadcastsd 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+
+	vbroadcastsd 0(%r10), %ymm6
+	vmulpd		%ymm15, %ymm6, %ymm6
+	vbroadcastsd 8(%r10), %ymm7
+	vmulpd		%ymm15, %ymm7, %ymm7
+	vbroadcastsd 16(%r10), %ymm8
+	vmulpd		%ymm15, %ymm8, %ymm8
+	vbroadcastsd 24(%r10), %ymm9
+	vmulpd		%ymm15, %ymm9, %ymm9
+
+
+	// inner kernel dgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG7, %r13  // x_t
+	movq	ARG10, %r14  // z_n
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+	movq	ARG3, %r10 // alpha_t
+	movq	ARG8, %r11   // beta_t
+	movq	ARG9, %r12   // y_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG11, %r10 // z_t 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_nt_4_lib4, .-kernel_dgemv_nt_4_lib4
+#endif
+
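+// reference semantics of kernel_dgemv_nt_4_lib4 (hedged sketch): a single
+// sweep over the k x 4 column slab A updates both products at once,
+//
+//	z_n(0:k) += alpha_n * A * x_n(0:4)               // n side, accumulated in place
+//	z_t(0:4)  = beta_t*y_t + alpha_t * A' * x_t(0:k) // t side
+//
+// x_n is pre-scaled by alpha_n into ymm6..ymm9 before the inner nt kernel
+// runs, and the t-side accumulators are blended and scaled by alpha_t/beta_t
+// before the final store of z_t.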
+
+
+
+
+//                                1      2                3                4          5        6            7            8               9            10           11           12
+// void kernel_dgemv_nt_4_vs_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_nt_4_vs_lib4
+	.type kernel_dgemv_nt_4_vs_lib4, @function
+kernel_dgemv_nt_4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_nt_4_vs_lib4
+_kernel_dgemv_nt_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_nt_4_vs_lib4
+	.def kernel_dgemv_nt_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha_n
+	vbroadcastsd 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+	movq	ARG12, %r11 // km
+
+	vbroadcastsd 0(%r10), %ymm6
+	vmulpd		%ymm15, %ymm6, %ymm6
+	cmpl	$2, %r11d
+	jl		0f
+	vbroadcastsd 8(%r10), %ymm7
+	vmulpd		%ymm15, %ymm7, %ymm7
+	cmpl	$3, %r11d
+	jl		0f
+	vbroadcastsd 16(%r10), %ymm8
+	vmulpd		%ymm15, %ymm8, %ymm8
+	je		0f
+	vbroadcastsd 24(%r10), %ymm9
+	vmulpd		%ymm15, %ymm9, %ymm9
+0:
+
+	// inner kernel dgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG7, %r13  // x_t
+	movq	ARG10, %r14  // z_n
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+	movq	ARG3, %r10 // alpha_t
+	movq	ARG8, %r11   // beta_t
+	movq	ARG9, %r12   // y_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG11, %r10 // z_t 
+	movq	ARG12, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_nt_4_vs_lib4, .-kernel_dgemv_nt_4_vs_lib4
+#endif
+
+
+
+
+
+//                            1      2              3          4        5           6
+// void kernel_dsymv_l_4_lib4(int k, double *alpha, double *A, int sda, double *x, double *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsymv_l_4_lib4
+	.type kernel_dsymv_l_4_lib4, @function
+kernel_dsymv_l_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsymv_l_4_lib4
+_kernel_dsymv_l_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsymv_l_4_lib4
+	.def kernel_dsymv_l_4_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha
+	vbroadcastsd 0(%r10), %ymm15
+
+	movq	ARG5, %r10 // x_n
+
+	vbroadcastsd 0(%r10), %ymm6
+	vmulpd		%ymm15, %ymm6, %ymm6
+	vbroadcastsd 8(%r10), %ymm7
+	vmulpd		%ymm15, %ymm7, %ymm7
+	vbroadcastsd 16(%r10), %ymm8
+	vmulpd		%ymm15, %ymm8, %ymm8
+	vbroadcastsd 24(%r10), %ymm9
+	vmulpd		%ymm15, %ymm9, %ymm9
+
+
+	// inner edge dsymv & kernel dgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13  // x_t
+	movq	ARG6, %r14  // z_n
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dsymv_add_nt_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // z_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // z_t 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsymv_l_4_lib4, .-kernel_dsymv_l_4_lib4
+#endif
+
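+// reference semantics of kernel_dsymv_l_4_lib4 (hedged sketch): z += alpha*A*x
+// for a symmetric A of which only the lower triangle is stored.  The edge
+// routine handles the diagonal 4x4 block, and the fused nt kernel lets every
+// strictly-lower element A[i][j] (i>j) contribute twice, as A[i][j]*x[j] into
+// z[i] on the n side and as A[i][j]*x[i] into z[j] on the t side, which is why
+// x and z are passed as both the n and the t operands.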
+
+
+
+
+//                                1      2              3         4          5        6           7          8
+// void kernel_dsymv_l_4_gen_lib4(int k, double *alpha, int offA, double *A, int sda, double *x, double *z, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsymv_l_4_gen_lib4
+	.type kernel_dsymv_l_4_gen_lib4, @function
+kernel_dsymv_l_4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsymv_l_4_gen_lib4
+_kernel_dsymv_l_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsymv_l_4_gen_lib4
+	.def kernel_dsymv_l_4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha
+	vbroadcastsd 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+
+	vbroadcastsd 0(%r10), %ymm6
+	vmulpd		%ymm15, %ymm6, %ymm6
+	vbroadcastsd 8(%r10), %ymm7
+	vmulpd		%ymm15, %ymm7, %ymm7
+	vbroadcastsd 16(%r10), %ymm8
+	vmulpd		%ymm15, %ymm8, %ymm8
+	vbroadcastsd 24(%r10), %ymm9
+	vmulpd		%ymm15, %ymm9, %ymm9
+
+
+	// inner edge dsymv & kernel dgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13  // x_t
+	movq	ARG7, %r14  // z_n
+	movq	ARG3, %r15 // offA
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_DSYMV_ADD_NT_4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dsymv_add_nt_4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dsymv_add_nt_4_gen_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11   // z_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z_t 
+	movq	ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsymv_l_4_gen_lib4, .-kernel_dsymv_l_4_gen_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
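+// note: each constant above is written as two .long halves of its IEEE-754
+// double encoding, low word first: 1071644672 = 0x3FE00000 is the high word of
+// 0.5, 1072693248 = 0x3FF00000 that of 1.0, and so on; .LC02 thus holds the
+// doubles 0.5, 1.5, 2.5, 3.5 and .LC03 holds 4.5, 5.5, 6.5, 7.5 in increasing
+// memory order, which is what the masked load/store helpers compare row
+// indices against.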
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemv_8_lib4.S b/kernel/avx/kernel_dgemv_8_lib4.S
new file mode 100644
index 0000000..53d371e
--- /dev/null
+++ b/kernel/avx/kernel_dgemv_8_lib4.S
@@ -0,0 +1,1575 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
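+// note on the PROLOGUE/EPILOGUE pairs above: on the System V AMD64 ABI
+// (OS_LINUX, OS_MAC) the first six integer arguments arrive in rdi, rsi, rdx,
+// rcx, r8, r9 and only rbx, rbp, r12-r15 must be preserved, so ARG7 and later
+// are read from the caller's stack.  On Windows x64 the first four arguments
+// arrive in rcx, rdx, r8, r9, ARG5 and later sit past the 32-byte shadow space
+// (hence the +40 offset), and rdi, rsi and xmm6-xmm15 are callee-saved as
+// well, which is why the OS_WINDOWS prologue also spills the xmm registers and
+// uses a larger stack frame.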
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- x
+// r15   <- dirty
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z0 z1 z2 z3]_b
+// ymm3  <- [z4 z5 z6 z7]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- x+k*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z0 z1 z2 z3]_b
+// ymm3  <- [z4 z5 z6 z7]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_n_8_lib4, @function
+inner_kernel_dgemv_add_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_n_8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_8_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+	cmpl	$4, %r10d
+
+	prefetcht0	0(%r11) // software prefetch
+	prefetcht0	0(%r15) // software prefetch
+	prefetcht0	64(%r11) // software prefetch
+	prefetcht0	64(%r15) // software prefetch
+
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	128(%r11) // software prefetch
+	prefetcht0	128(%r15) // software prefetch
+
+	vbroadcastsd	0(%r13), %ymm12
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmovapd	0(%r15), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	subl	$4, %r10d
+
+	vbroadcastsd	8(%r13), %ymm12
+	vmovapd	32(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmovapd	32(%r15), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	
+	prefetcht0	192(%r11) // software prefetch
+	prefetcht0	192(%r15) // software prefetch
+
+	vbroadcastsd	16(%r13), %ymm12
+	vmovapd	64(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmovapd	64(%r15), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+
+	vbroadcastsd	24(%r13), %ymm12
+	addq	$32, %r13 // x+4
+	vmovapd	96(%r11), %ymm8
+	addq	$128, %r11 // A0+4*bs
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmovapd	96(%r15), %ymm8
+	addq	$128, %r15 // A1+4*bs
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vbroadcastsd	0(%r13), %ymm12
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmovapd	0(%r15), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	addq	$32, %r11
+	addq	$32, %r15
+	addq	$8, %r13
+	
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+
+	jg		0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_n_8_lib4, .-inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
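+// note: the 8-row variant above walks two 4-row panels at once (r11 = A0 and
+// r15 = A0 + 4*sda*sizeof(double)), so every broadcast element of x feeds two
+// vmulpd/vaddpd pairs, and the prefetcht0 instructions pull the next cache
+// lines of both panels ahead of use; the clean-up loop falls back to one
+// column per iteration when fewer than four columns remain.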
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm4  <- [z4a z4b z4c z4d]
+// ymm5  <- [z5a z5b z5c z5d]
+// ymm6  <- [z6a z6b z6c z6d]
+// ymm7  <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x+k*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm4  <- [z4a z4b z4c z4d]
+// ymm5  <- [z5a z5b z5c z5d]
+// ymm6  <- [z6a z6b z6c z6d]
+// ymm7  <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_T_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_t_8_lib4, @function
+inner_kernel_dgemv_add_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_t_8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_8_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+
+	prefetcht0	0(%r11) // software prefetch
+	prefetcht0	64(%r11) // software prefetch
+	prefetcht0	128(%r11) // software prefetch
+	prefetcht0	192(%r11) // software prefetch
+
+	jl		0f // clean-up loop
+
+	movq	%r11, %r14
+	addq	%r12, %r14 // A+bs*sda
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	0(%r14) // software prefetch
+
+	vmovupd	0(%r13), %ymm12
+	addq	$32, %r13 // x+4
+
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	subl	$4, %r10d
+
+	vmovapd	32(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	prefetcht0	64(%r14) // software prefetch
+
+	vmovapd	64(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmovapd	96(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+
+	prefetcht0	128(%r14) // software prefetch
+
+	vmovapd	128(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	
+	vmovapd	160(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	
+	prefetcht0	192(%r14) // software prefetch
+
+	vmovapd	192(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	vmovapd	224(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+	
+//	addq	%r12, %r11 // A+bs*sda
+	movq	%r14, %r11 // A+bs*sda
+	addq	%r12, %r14 // A+bs*sda+bs*sda
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vcvtsi2sd	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vsubpd		%ymm14, %ymm13, %ymm14
+
+	vmaskmovpd	0(%r13), %ymm14, %ymm12
+
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	vmovapd	32(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	vmovapd	64(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmovapd	96(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+		
+	vmovapd	128(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	
+	vmovapd	160(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	
+	vmovapd	192(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	vmovapd	224(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	sall	$3, %r10d
+//	movslq	%r10d, %r10
+	addq	%r10, %r11
+	addq	%r10, %r13
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_t_8_lib4, .-inner_kernel_dgemv_add_t_8_lib4
+#endif
+#endif
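+
+// note on the clean-up loop above: ymm14 is built as .LC02 - k = { 0.5-k, 1.5-k,
+// 2.5-k, 3.5-k }, so lane i has its sign bit set exactly when i < k; vmaskmovpd
+// then loads only the first k entries of x and zeroes the remaining lanes.
+// Illustrative scalar sketch (variable names are not part of the kernel):
+//   for(i=0; i<4; i++) x_masked[i] = ( (0.5+i) - (double)k < 0.0 ) ? x[i] : 0.0;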
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- x
+// r15   <- dirty
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z0 z1 z2 z3]_b
+// ymm3  <- [z4 z5 z6 z7]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- k-4
+// r11   <- A+4*4*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- x+4*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z0 z1 z2 z3]_b
+// ymm3  <- [z4 z5 z6 z7]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMV_UN_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmv_un_8_lib4, @function
+inner_edge_dtrmv_un_8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmv_un_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmv_un_8_lib4:
+#endif
+#endif
+	
+	movq	%r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	// first 4 columns
+	vmovapd			0(%r11), %ymm8
+	vblendpd		$0x1, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	
+	subl			$4, %r10d
+
+	vmovapd			32(%r11), %ymm8
+	vblendpd		$0x3, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	
+	vmovapd			64(%r11), %ymm8
+	vblendpd		$0x7, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+
+	vmovapd			96(%r11), %ymm8
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	
+	addq			$128, %r11
+	addq			$128, %r15
+	addq			$32, %r13
+
+
+
+	// last 4 columns
+	vbroadcastsd	0(%r13), %ymm12
+	vmovapd			0(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			0(%r15), %ymm8
+	vblendpd		$0x1, %ymm8, %ymm14, %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	
+	subl			$4, %r10d
+
+	vbroadcastsd	8(%r13), %ymm12
+	vmovapd			32(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vmovapd			32(%r15), %ymm8
+	vblendpd		$0x3, %ymm8, %ymm14, %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	
+	vbroadcastsd	16(%r13), %ymm12
+	vmovapd			64(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			64(%r15), %ymm8
+	vblendpd		$0x7, %ymm8, %ymm14, %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+
+	vbroadcastsd	24(%r13), %ymm12
+	vmovapd			96(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vmovapd			96(%r15), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	
+	addq			$128, %r11
+	addq			$128, %r15
+	addq			$32, %r13
+	
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmv_un_8_lib4, .-inner_edge_dtrmv_un_8_lib4
+#endif
+#endif
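+
+// note on the edge above: each column of the 8x8 triangular block is combined with
+// the zero register ymm14 via vblendpd ($0x1, $0x3, $0x7), so only the entries on
+// and above the diagonal contribute; the last column of each 4-column group needs
+// no blend.  Illustrative scalar sketch (index names are not part of the kernel):
+//   for(j=0; j<8; j++) for(i=0; i<=j; i++) z[i] += A[i][j] * x[j];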
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n
+//
+// input arguments:
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_8_lib4, @function
+inner_blend_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_8_lib4; .scl 2; .type 32; .endef
+inner_blend_n_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm2, %ymm0
+	vaddpd	%ymm1, %ymm3, %ymm1
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_8_lib4, .-inner_blend_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t
+//
+// input arguments:
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_8_lib4, @function
+inner_blend_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_8_lib4; .scl 2; .type 32; .endef
+inner_blend_t_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm5, %ymm4, %ymm4
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vhaddpd	%ymm7, %ymm6, %ymm6
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm3
+	vperm2f128	$0x2, %ymm4, %ymm6, %ymm5
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vperm2f128	$0x13, %ymm4, %ymm6, %ymm4
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm4, %ymm5, %ymm1
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_8_lib4, .-inner_blend_t_8_lib4
+#endif
+#endif
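+
+// note: the vhaddpd/vperm2f128 sequence above reduces the 8 accumulators across
+// their 4 lanes, i.e. (illustrative notation, not part of the kernel):
+//   z_i = z_ia + z_ib + z_ic + z_id,   i = 0..7
+// leaving [z0 z1 z2 z3] in ymm0 and [z4 z5 z6 z7] in ymm1.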
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_SCALE_AB_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_scale_ab_8_lib4, @function
+inner_blend_n_scale_ab_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_scale_ab_8_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm2, %ymm0
+	vaddpd	%ymm1, %ymm3, %ymm1
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm1, %ymm15, %ymm1
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd		0(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	vmovupd		32(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm1, %ymm14, %ymm1
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_scale_ab_8_lib4, .-inner_blend_n_scale_ab_8_lib4
+#endif
+#endif
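+
+// note: after the reduction this scales and accumulates per entry (illustrative
+// sketch; alpha broadcast from r10, beta from r11, y loaded unaligned from r12):
+//   z[i] = alpha*z[i] + beta*y[i],   i = 0..7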
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_AB_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_ab_8_lib4, @function
+inner_blend_t_scale_ab_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_ab_8_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm5, %ymm4, %ymm4
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vhaddpd	%ymm7, %ymm6, %ymm6
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm3
+	vperm2f128	$0x2, %ymm4, %ymm6, %ymm5
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vperm2f128	$0x13, %ymm4, %ymm6, %ymm4
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm4, %ymm5, %ymm1
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm1, %ymm15, %ymm1
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd		0(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	vmovupd		32(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm1, %ymm14, %ymm1
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_ab_8_lib4, .-inner_blend_t_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==n
+//
+// input arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLENDER_N_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blender_n_8_lib4, @function
+inner_blender_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_blender_n_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blender_n_8_lib4; .scl 2; .type 32; .endef
+inner_blender_n_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm2, %ymm0
+	vaddpd	%ymm1, %ymm3, %ymm1
+
+	cmpl	$0, %r10d // alg
+	je		0f // return
+
+	cmpl	$1, %r10d // alg
+	jne		1f // alg==-1
+
+	// alg==1
+	vmovupd		0(%r11), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+
+	jmp		0f // return
+
+1:
+
+	// alg==-1
+	vmovupd		0(%r11), %ymm15
+	vsubpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vsubpd		%ymm1, %ymm15, %ymm1
+
+0: // return
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blender_n_8_lib4, .-inner_blender_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==t
+//
+// input arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]

+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLENDER_T_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blender_t_8_lib4, @function
+inner_blender_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_blender_t_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blender_t_8_lib4; .scl 2; .type 32; .endef
+inner_blender_t_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm5, %ymm4, %ymm4
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vhaddpd	%ymm7, %ymm6, %ymm6
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm3
+	vperm2f128	$0x2, %ymm4, %ymm6, %ymm5
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vperm2f128	$0x13, %ymm4, %ymm6, %ymm4
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm4, %ymm5, %ymm1
+
+	cmpl	$0, %r10d // alg
+	je		0f // return
+
+	cmpl	$1, %r10d // alg
+	jne		1f // alg==-1
+
+	// alg==1
+	vmovupd		0(%r11), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+
+	jmp		0f // return
+
+1:
+
+	// alg==-1
+	vmovupd		0(%r11), %ymm15
+	vsubpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vsubpd		%ymm1, %ymm15, %ymm1
+
+0: // return
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blender_t_8_lib4, .-inner_blender_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store 
+//
+// input arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+//
+// output arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8_lib4, @function
+inner_store_8_lib4:
+#elif defined(OS_MAC)
+_inner_store_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8_lib4; .scl 2; .type 32; .endef
+inner_store_8_lib4:
+#endif
+#endif
+	
+	vmovupd %ymm0, 0(%r10)
+	vmovupd %ymm1, 32(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8_lib4, .-inner_store_8_lib4
+#endif
+#endif
+
+
+
+
+
+//                            rdi    rsi            rdx        rcx      r8         r9            rsp+8      rsp+16
+// void kernel_dgemv_n_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
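+//
+// computes z[0:8] = alpha * A[0:8,0:k] * x[0:k] + beta * y[0:8], with A stored in
+// 4-row panels of panel stride sda; illustrative call sketch (argument values
+// assumed): kernel_dgemv_n_8_lib4(k, &alpha, pA, sda, x, &beta, y, z);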
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_n_8_lib4
+	.type kernel_dgemv_n_8_lib4, @function
+kernel_dgemv_n_8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_n_8_lib4
+_kernel_dgemv_n_8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_n_8_lib4
+	.def kernel_dgemv_n_8_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+	// store
+
+	movq	ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_n_8_lib4, .-kernel_dgemv_n_8_lib4
+#endif
+
+
+
+
+
+//                            rdi    rsi            rdx        rcx      r8         r9            rsp+8      rsp+16
+// void kernel_dgemv_t_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
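+//
+// computes z[0:8] = alpha * A[0:k,0:8]^T * x[0:k] + beta * y[0:8], i.e. the dot
+// product of each of the 8 columns of A with x, scaled and accumulated as in the
+// n variant (illustrative description; storage is the same 4-row panel format).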
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_t_8_lib4
+	.type kernel_dgemv_t_8_lib4, @function
+kernel_dgemv_t_8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_t_8_lib4
+_kernel_dgemv_t_8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_t_8_lib4
+	.def kernel_dgemv_t_8_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_8_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_8_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_t_8_lib4, .-kernel_dgemv_t_8_lib4
+#endif
+
+
+
+
+
+//                             rdi    rsi        rdx      rcx        r8
+// void kernel_dtrmv_un_8_lib4(int k, double *A, int sda, double *x, double *z);
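+//
+// computes the first 8 entries of z = U * x for an upper triangular U held in
+// 4-row panels: the 8x8 triangular edge is handled by inner_edge_dtrmv_un_8_lib4,
+// the remaining k-8 columns by the plain dgemv_add_n kernel (illustrative
+// description of the call sequence below).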
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmv_un_8_lib4
+	.type kernel_dtrmv_un_8_lib4, @function
+kernel_dtrmv_un_8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmv_un_8_lib4
+_kernel_dtrmv_un_8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmv_un_8_lib4
+	.def kernel_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_un_8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dtrmv edge & dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG4, %r13  // x
+
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMV_UN_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmv_un_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmv_un_8_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+	// call inner blender n
+
+#if MACRO_LEVEL>=1
+	INNER_BLENDER_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_8_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // z
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmv_un_8_lib4, .-kernel_dtrmv_un_8_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
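+
+// note: each constant above is emitted as two .long values, the low and high
+// 32-bit halves of its IEEE-754 double encoding; e.g. 0.5 = 0x3FE0000000000000,
+// hence the lines ".long 0" / ".long 1071644672" (1071644672 == 0x3FE00000).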
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgeqrf_4_lib4.c b/kernel/avx/kernel_dgeqrf_4_lib4.c
new file mode 100644
index 0000000..a5faf20
--- /dev/null
+++ b/kernel/avx/kernel_dgeqrf_4_lib4.c
@@ -0,0 +1,2751 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+#include "../../include/blasfeo_d_kernel.h"
+
+
+
+void kernel_dgeqrf_4_lib4(int m, double *pD, int sdd, double *dD)
+	{
+	int ii, jj, ll;
+	double alpha, beta, tmp, w1, w2, w3;
+	const int ps = 4;
+	// first column
+	beta = 0.0;
+	ii = 1;
+	if(m>1)
+		{
+		tmp = pD[1+ps*0];
+		beta += tmp*tmp;
+		if(m>2)
+			{
+			tmp = pD[2+ps*0];
+			beta += tmp*tmp;
+			if(m>3)
+				{
+				tmp = pD[3+ps*0];
+				beta += tmp*tmp;
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[0] = 0.0;
+		}
+	else
+		{
+		alpha = pD[0+ps*0];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau0
+		dD[0] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0
+		pD[0+ps*0] = beta;
+		ii = 1;
+		if(m>1)
+			{
+			pD[1+ps*0] *= tmp;
+			if(m>2)
+				{
+				pD[2+ps*0] *= tmp;
+				if(m>3)
+					{
+					pD[3+ps*0] *= tmp;
+					}
+				}
+			}
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*0] *= tmp;
+			pD[1+ii*sdd+ps*0] *= tmp;
+			pD[2+ii*sdd+ps*0] *= tmp;
+			pD[3+ii*sdd+ps*0] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*0] *= tmp;
+			}
+		}
+	// gemv_t & ger
+	w1 = pD[0+ps*1];
+	w2 = pD[0+ps*2];
+	w3 = pD[0+ps*3];
+	if(m>1)
+		{
+		w1 += pD[1+ps*1] * pD[1+ps*0];
+		w2 += pD[1+ps*2] * pD[1+ps*0];
+		w3 += pD[1+ps*3] * pD[1+ps*0];
+		if(m>2)
+			{
+			w1 += pD[2+ps*1] * pD[2+ps*0];
+			w2 += pD[2+ps*2] * pD[2+ps*0];
+			w3 += pD[2+ps*3] * pD[2+ps*0];
+			if(m>3)
+				{
+				w1 += pD[3+ps*1] * pD[3+ps*0];
+				w2 += pD[3+ps*2] * pD[3+ps*0];
+				w3 += pD[3+ps*3] * pD[3+ps*0];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		w1 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+		w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+		w1 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+		w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+		w1 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+		w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+		w1 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+		w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		w1 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+		w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+		}
+	w1 = - dD[0] * w1;
+	w2 = - dD[0] * w2;
+	w3 = - dD[0] * w3;
+	pD[0+ps*1] += w1;
+	pD[0+ps*2] += w2;
+	pD[0+ps*3] += w3;
+	if(m>1)
+		{
+		pD[1+ps*1] += w1 * pD[1+ps*0];
+		pD[1+ps*2] += w2 * pD[1+ps*0];
+		pD[1+ps*3] += w3 * pD[1+ps*0];
+		if(m>2)
+			{
+			pD[2+ps*1] += w1 * pD[2+ps*0];
+			pD[2+ps*2] += w2 * pD[2+ps*0];
+			pD[2+ps*3] += w3 * pD[2+ps*0];
+			if(m>3)
+				{
+				pD[3+ps*1] += w1 * pD[3+ps*0];
+				pD[3+ps*2] += w2 * pD[3+ps*0];
+				pD[3+ps*3] += w3 * pD[3+ps*0];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		pD[0+ii*sdd+ps*1] += w1 * pD[0+ii*sdd+ps*0];
+		pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*0];
+		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*0];
+		pD[1+ii*sdd+ps*1] += w1 * pD[1+ii*sdd+ps*0];
+		pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*0];
+		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*0];
+		pD[2+ii*sdd+ps*1] += w1 * pD[2+ii*sdd+ps*0];
+		pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*0];
+		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*0];
+		pD[3+ii*sdd+ps*1] += w1 * pD[3+ii*sdd+ps*0];
+		pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*0];
+		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*0];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		pD[ll+ii*sdd+ps*1] += w1 * pD[ll+ii*sdd+ps*0];
+		pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*0];
+		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*0];
+		}
+	if(m==1)
+		return;
+	// second column
+	beta = 0.0;
+	if(m>2)
+		{
+		tmp = pD[2+ps*1];
+		beta += tmp*tmp;
+		if(m>3)
+			{
+			tmp = pD[3+ps*1];
+			beta += tmp*tmp;
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[1] = 0.0;
+		}
+	else
+		{
+		alpha = pD[1+ps*1];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau1
+		dD[1] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v1
+		pD[1+ps*1] = beta;
+		if(m>2)
+			{
+			pD[2+ps*1] *= tmp;
+			if(m>3)
+				{
+				pD[3+ps*1] *= tmp;
+				}
+			}
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*1] *= tmp;
+			pD[1+ii*sdd+ps*1] *= tmp;
+			pD[2+ii*sdd+ps*1] *= tmp;
+			pD[3+ii*sdd+ps*1] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*1] *= tmp;
+			}
+		}
+	// gemv_t & ger
+	w2 = pD[1+ps*2];
+	w3 = pD[1+ps*3];
+	if(m>2)
+		{
+		w2 += pD[2+ps*2] * pD[2+ps*1];
+		w3 += pD[2+ps*3] * pD[2+ps*1];
+		if(m>3)
+			{
+			w2 += pD[3+ps*2] * pD[3+ps*1];
+			w3 += pD[3+ps*3] * pD[3+ps*1];
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+		w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+		w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+		w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+		}
+	w2 = - dD[1] * w2;
+	w3 = - dD[1] * w3;
+	pD[1+ps*2] += w2;
+	pD[1+ps*3] += w3;
+	if(m>2)
+		{
+		pD[2+ps*2] += w2 * pD[2+ps*1];
+		pD[2+ps*3] += w3 * pD[2+ps*1];
+		if(m>3)
+			{
+			pD[3+ps*2] += w2 * pD[3+ps*1];
+			pD[3+ps*3] += w3 * pD[3+ps*1];
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*1];
+		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*1];
+		pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*1];
+		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*1];
+		pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*1];
+		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*1];
+		pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*1];
+		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*1];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*1];
+		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*1];
+		}
+	if(m==2)
+		return;
+	// third column
+	beta = 0.0;
+	if(m>3)
+		{
+		tmp = pD[3+ps*2];
+		beta += tmp*tmp;
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[2] = 0.0;
+		}
+	else
+		{
+		alpha = pD[2+ps*2];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau2
+		dD[2] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v2
+		pD[2+ps*2] = beta;
+		if(m>3)
+			{
+			pD[3+ps*2] *= tmp;
+			}
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*2] *= tmp;
+			pD[1+ii*sdd+ps*2] *= tmp;
+			pD[2+ii*sdd+ps*2] *= tmp;
+			pD[3+ii*sdd+ps*2] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*2] *= tmp;
+			}
+		}
+	// gemv_t & ger
+	w3 = pD[2+ps*3];
+	if(m>3)
+		{
+		w3 += pD[3+ps*3] * pD[3+ps*2];
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+		}
+	w3 = - dD[2] * w3;
+	pD[2+ps*3] += w3;
+	if(m>3)
+		{
+		pD[3+ps*3] += w3 * pD[3+ps*2];
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*2];
+		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*2];
+		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*2];
+		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*2];
+		}
+	if(m==3)
+		return;
+	// fourth column
+	beta = 0.0;
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[3] = 0.0;
+		}
+	else
+		{
+		alpha = pD[3+ps*3];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau3
+		dD[3] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v3
+		pD[3+ps*3] = beta;
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*3] *= tmp;
+			pD[1+ii*sdd+ps*3] *= tmp;
+			pD[2+ii*sdd+ps*3] *= tmp;
+			pD[3+ii*sdd+ps*3] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*3] *= tmp;
+			}
+		}
+	return;
+	}
+
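+// Reference sketch of the per-column Householder step used above: for a column x
+// of length n with x[0] = alpha, it computes beta = -sign(alpha)*||x||_2 and
+// tau = (beta-alpha)/beta, stores beta in x[0] and scales the reflector tail
+// x[1:] by 1/(alpha-beta) (the leading entry of the reflector is an implicit 1).
+// Contiguous storage is assumed here for clarity; the kernel above operates on
+// the 4-wide panel-major layout instead.  This helper is illustrative only and
+// is not called by the kernels in this file.
+static void ref_dhouse_gen(int n, double *x, double *tau)
+	{
+	int ii;
+	double alpha, beta, scale;
+	beta = 0.0;
+	for(ii=1; ii<n; ii++)
+		{
+		beta += x[ii]*x[ii];
+		}
+	if(beta==0.0)
+		{
+		// column is already zero below the diagonal
+		*tau = 0.0;
+		return;
+		}
+	alpha = x[0];
+	beta = sqrt(beta + alpha*alpha);
+	if(alpha>0)
+		beta = -beta;
+	*tau = (beta-alpha) / beta;
+	scale = 1.0 / (alpha-beta);
+	for(ii=1; ii<n; ii++)
+		{
+		x[ii] *= scale;
+		}
+	x[0] = beta;
+	return;
+	}
+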
+
+// unblocked algorithm
+void kernel_dgeqrf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+	const int ps = 4;
+	imax = k;//m<n ? m : n;
+	double alpha, beta, tmp, w0;
+	double *pC00, *pC10, *pC01, *pC11;
+	int offset;
+	double *pD0 = pD-offD;
+	for(ii=0; ii<imax; ii++)
+		{
+		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+		pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+		beta = 0.0;
+		jmax = m-ii-1;
+		jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+		jmax0 = jmax<jmax0 ? jmax : jmax0;
+		offset = 0;
+		jj = 0;
+		if(jmax0>0)
+			{
+			for( ; jj<jmax0; jj++)
+				{
+				tmp = pC10[0+offset];
+				beta += tmp*tmp;
+				offset += 1;
+				}
+			offset += -ps+ps*sdd;
+			}
+		for( ; jj<jmax-3; jj+=4)
+			{
+			tmp = pC10[0+offset];
+			beta += tmp*tmp;
+			tmp = pC10[1+offset];
+			beta += tmp*tmp;
+			tmp = pC10[2+offset];
+			beta += tmp*tmp;
+			tmp = pC10[3+offset];
+			beta += tmp*tmp;
+			offset += ps*sdd;
+			}
+		for(ll=0; ll<jmax-jj; ll++)
+			{
+			tmp = pC10[0+offset];
+			beta += tmp*tmp;
+			offset += 1;
+			}
+		if(beta==0.0)
+			{
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			offset = 0;
+			jj = 0;
+			if(jmax0>0)
+				{
+				for( ; jj<jmax0; jj++)
+					{
+					pC10[0+offset] *= tmp;
+					offset += 1;
+					}
+				offset += -ps+ps*sdd;
+				}
+			for( ; jj<jmax-3; jj+=4)
+				{
+				pC10[0+offset] *= tmp;
+				pC10[1+offset] *= tmp;
+				pC10[2+offset] *= tmp;
+				pC10[3+offset] *= tmp;
+				offset += ps*sdd;
+				}
+			for(ll=0; ll<jmax-jj; ll++)
+				{
+				pC10[0+offset] *= tmp;
+				offset += 1;
+				}
+			pC00[0] = beta;
+			}
+		if(ii<n)
+			{
+			pC01 = pC00 + ps;
+			pC11 = pC10 + ps;
+			kmax = jmax;
+			kmax0 = jmax0;
+			jmax = n-ii-1;
+			jj = 0;
+			for( ; jj<jmax; jj++)
+				{
+				w0 = pC01[0+ps*jj] * 1.0;
+				offset = 0;
+				kk = 0;
+				if(kmax0>0)
+					{
+					for( ; kk<kmax0; kk++)
+						{
+						w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+						offset += 1;
+						}
+					offset += -ps+ps*sdd;
+					}
+				for( ; kk<kmax-3; kk+=4)
+					{
+					w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+					w0 += pC11[1+offset+ps*jj] * pC10[1+offset];
+					w0 += pC11[2+offset+ps*jj] * pC10[2+offset];
+					w0 += pC11[3+offset+ps*jj] * pC10[3+offset];
+					offset += ps*sdd;
+					}
+				for(ll=0; ll<kmax-kk; ll++)
+					{
+					w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+					offset += 1;
+					}
+				w0 = - dD[ii] * w0;
+				pC01[0+ps*jj] += w0;
+				offset = 0;
+				kk = 0;
+				if(kmax0>0)
+					{
+					for( ; kk<kmax0; kk++)
+						{
+						pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+						offset += 1;
+						}
+					offset = offset-ps+ps*sdd;
+					}
+				for( ; kk<kmax-3; kk+=4)
+					{
+					pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+					pC11[1+offset+ps*jj] += w0 * pC10[1+offset];
+					pC11[2+offset+ps*jj] += w0 * pC10[2+offset];
+					pC11[3+offset+ps*jj] += w0 * pC10[3+offset];
+					offset += ps*sdd;
+					}
+				for(ll=0; ll<kmax-kk; ll++)
+					{
+					pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+					offset += 1;
+					}
+				}
+			}
+		}
+	return;
+	}
+
+
+
+void kernel_dlarf_4_lib4(int m, int n, double *pD, int sdd, double *dD, double *pC0, int sdc)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, ll;
+	const int ps = 4;
+	double v10,
+	       v20, v21,
+		   v30, v31, v32;
+	double tmp, d0, d1, d2, d3;
+	double *pC;
+	double pT[16];// = {};
+	int ldt = 4;
+	double pW[8];// = {};
+	int ldw = 2;
+	// dot product of v
+	v10 = 0.0;
+	v20 = 0.0;
+	v30 = 0.0;
+	v21 = 0.0;
+	v31 = 0.0;
+	v32 = 0.0;
+	if(m>1)
+		{
+		v10 = 1.0 * pD[1+ps*0];
+		if(m>2)
+			{
+			v10 += pD[2+ps*1] * pD[2+ps*0];
+			v20 = 1.0 * pD[2+ps*0];
+			v21 = 1.0 * pD[2+ps*1];
+			if(m>3)
+				{
+				v10 += pD[3+ps*1] * pD[3+ps*0];
+				v20 += pD[3+ps*2] * pD[3+ps*0];
+				v21 += pD[3+ps*2] * pD[3+ps*1];
+				v30 = 1.0 * pD[3+ps*0];
+				v31 = 1.0 * pD[3+ps*1];
+				v32 = 1.0 * pD[3+ps*2];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+		v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+		v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+		v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+		v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+		v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+		v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+		v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+		v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+		v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+		v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+		v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+		v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+		v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+		v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+		v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+		v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+		v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+		v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+		v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+		v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+		v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+		v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+		v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+		v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+		v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+		v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+		v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+		v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+		}
+	// compute lower triangular T containing tau for matrix update
+	pT[0+ldt*0] = dD[0];
+	pT[1+ldt*1] = dD[1];
+	pT[2+ldt*2] = dD[2];
+	pT[3+ldt*3] = dD[3];
+	pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+	pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+	pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+	pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+	pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+	pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
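+	// (equivalently: T[i][i] = dD[i] and, for j>i, T[j][i] = -dD[j] * sum_{k=i..j-1} v_jk*T[k][i],
+	// the standard compact-WY recurrence, with v10..v32 the pairwise dot products of the
+	// reflector columns computed above)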
+	// downgrade matrix
+	pW[0] = 0.0;
+	pW[1] = 0.0;
+	pW[2] = 0.0;
+	pW[3] = 0.0;
+	pW[4] = 0.0;
+	pW[5] = 0.0;
+	pW[6] = 0.0;
+	pW[7] = 0.0;
+	ii = 0;
+	for( ; ii<n-1; ii+=2)
+		{
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ldw*0] = tmp;
+		tmp = pC[0+ps*1];
+		pW[1+ldw*0] = tmp;
+		if(m>1)
+			{
+			d0 = pD[1+ps*0];
+			tmp = pC[1+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] = tmp;
+			tmp = pC[1+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] = tmp;
+			if(m>2)
+				{
+				d0 = pD[2+ps*0];
+				d1 = pD[2+ps*1];
+				tmp = pC[2+ps*0];
+				pW[0+ldw*0] += tmp * d0;
+				pW[0+ldw*1] += tmp * d1;
+				pW[0+ldw*2] = tmp;
+				tmp = pC[2+ps*1];
+				pW[1+ldw*0] += tmp * d0;
+				pW[1+ldw*1] += tmp * d1;
+				pW[1+ldw*2] = tmp;
+				if(m>3)
+					{
+					d0 = pD[3+ps*0];
+					d1 = pD[3+ps*1];
+					d2 = pD[3+ps*2];
+					tmp = pC[3+ps*0];
+					pW[0+ldw*0] += tmp * d0;
+					pW[0+ldw*1] += tmp * d1;
+					pW[0+ldw*2] += tmp * d2;
+					pW[0+ldw*3] = tmp;
+					tmp = pC[3+ps*1];
+					pW[1+ldw*0] += tmp * d0;
+					pW[1+ldw*1] += tmp * d1;
+					pW[1+ldw*2] += tmp * d2;
+					pW[1+ldw*3] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pD[0+jj*sdd+ps*0];
+			d1 = pD[0+jj*sdd+ps*1];
+			d2 = pD[0+jj*sdd+ps*2];
+			d3 = pD[0+jj*sdd+ps*3];
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[0+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[1+jj*sdd+ps*0];
+			d1 = pD[1+jj*sdd+ps*1];
+			d2 = pD[1+jj*sdd+ps*2];
+			d3 = pD[1+jj*sdd+ps*3];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[1+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[2+jj*sdd+ps*0];
+			d1 = pD[2+jj*sdd+ps*1];
+			d2 = pD[2+jj*sdd+ps*2];
+			d3 = pD[2+jj*sdd+ps*3];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[2+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[3+jj*sdd+ps*0];
+			d1 = pD[3+jj*sdd+ps*1];
+			d2 = pD[3+jj*sdd+ps*2];
+			d3 = pD[3+jj*sdd+ps*3];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[3+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pD[ll+jj*sdd+ps*0];
+			d1 = pD[ll+jj*sdd+ps*1];
+			d2 = pD[ll+jj*sdd+ps*2];
+			d3 = pD[ll+jj*sdd+ps*3];
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[ll+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			}
+		// compute W^T *= T
+		pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+		pW[1+ldw*3] = pT[3+ldt*0]*pW[1+ldw*0] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[1+ldw*2] + pT[3+ldt*3]*pW[1+ldw*3];
+		pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+		pW[1+ldw*2] = pT[2+ldt*0]*pW[1+ldw*0] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[1+ldw*2];
+		pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+		pW[1+ldw*1] = pT[1+ldt*0]*pW[1+ldw*0] + pT[1+ldt*1]*pW[1+ldw*1];
+		pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+		pW[1+ldw*0] = pT[0+ldt*0]*pW[1+ldw*0];
+		// compute C -= V * W^T
+		pC[0+ps*0] -= pW[0+ldw*0];
+		pC[0+ps*1] -= pW[1+ldw*0];
+		if(m>1)
+			{
+			pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+			pC[1+ps*1] -= pD[1+ps*0]*pW[1+ldw*0] + pW[1+ldw*1];
+			if(m>2)
+				{
+				pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+				pC[2+ps*1] -= pD[2+ps*0]*pW[1+ldw*0] + pD[2+ps*1]*pW[1+ldw*1] + pW[1+ldw*2];
+				if(m>3)
+					{
+					pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+					pC[3+ps*1] -= pD[3+ps*0]*pW[1+ldw*0] + pD[3+ps*1]*pW[1+ldw*1] + pD[3+ps*2]*pW[1+ldw*2] + pW[1+ldw*3];
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pD[0+jj*sdd+ps*0];
+			d1 = pD[0+jj*sdd+ps*1];
+			d2 = pD[0+jj*sdd+ps*2];
+			d3 = pD[0+jj*sdd+ps*3];
+			pC[0+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[0+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[1+jj*sdd+ps*0];
+			d1 = pD[1+jj*sdd+ps*1];
+			d2 = pD[1+jj*sdd+ps*2];
+			d3 = pD[1+jj*sdd+ps*3];
+			pC[1+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[1+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[2+jj*sdd+ps*0];
+			d1 = pD[2+jj*sdd+ps*1];
+			d2 = pD[2+jj*sdd+ps*2];
+			d3 = pD[2+jj*sdd+ps*3];
+			pC[2+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[2+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[3+jj*sdd+ps*0];
+			d1 = pD[3+jj*sdd+ps*1];
+			d2 = pD[3+jj*sdd+ps*2];
+			d3 = pD[3+jj*sdd+ps*3];
+			pC[3+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[3+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pD[ll+jj*sdd+ps*0];
+			d1 = pD[ll+jj*sdd+ps*1];
+			d2 = pD[ll+jj*sdd+ps*2];
+			d3 = pD[ll+jj*sdd+ps*3];
+			pC[ll+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[ll+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			}
+		}
+	for( ; ii<n; ii++)
+		{
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ldw*0] = tmp;
+		if(m>1)
+			{
+			tmp = pC[1+ps*0];
+			pW[0+ldw*0] += tmp * pD[1+ps*0];
+			pW[0+ldw*1] = tmp;
+			if(m>2)
+				{
+				tmp = pC[2+ps*0];
+				pW[0+ldw*0] += tmp * pD[2+ps*0];
+				pW[0+ldw*1] += tmp * pD[2+ps*1];
+				pW[0+ldw*2] = tmp;
+				if(m>3)
+					{
+					tmp = pC[3+ps*0];
+					pW[0+ldw*0] += tmp * pD[3+ps*0];
+					pW[0+ldw*1] += tmp * pD[3+ps*1];
+					pW[0+ldw*2] += tmp * pD[3+ps*2];
+					pW[0+ldw*3] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[0+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[0+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[0+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[0+jj*sdd+ps*3];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[1+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[1+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[1+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[1+jj*sdd+ps*3];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[2+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[2+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[2+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[2+jj*sdd+ps*3];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[3+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[3+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[3+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[3+jj*sdd+ps*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[ll+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[ll+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[ll+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[ll+jj*sdd+ps*3];
+			}
+		// compute W^T *= T
+		pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+		pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+		pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+		pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+		// compute C -= V * W^T
+		pC[0+ps*0] -= pW[0+ldw*0];
+		if(m>1)
+			{
+			pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+			if(m>2)
+				{
+				pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+				if(m>3)
+					{
+					pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			pC[0+jj*sdc+ps*0] -= pD[0+jj*sdd+ps*0]*pW[0+ldw*0] + pD[0+jj*sdd+ps*1]*pW[0+ldw*1] + pD[0+jj*sdd+ps*2]*pW[0+ldw*2] + pD[0+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[1+jj*sdc+ps*0] -= pD[1+jj*sdd+ps*0]*pW[0+ldw*0] + pD[1+jj*sdd+ps*1]*pW[0+ldw*1] + pD[1+jj*sdd+ps*2]*pW[0+ldw*2] + pD[1+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[2+jj*sdc+ps*0] -= pD[2+jj*sdd+ps*0]*pW[0+ldw*0] + pD[2+jj*sdd+ps*1]*pW[0+ldw*1] + pD[2+jj*sdd+ps*2]*pW[0+ldw*2] + pD[2+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[3+jj*sdc+ps*0] -= pD[3+jj*sdd+ps*0]*pW[0+ldw*0] + pD[3+jj*sdd+ps*1]*pW[0+ldw*1] + pD[3+jj*sdd+ps*2]*pW[0+ldw*2] + pD[3+jj*sdd+ps*3]*pW[0+ldw*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			pC[ll+jj*sdc+ps*0] -= pD[ll+jj*sdd+ps*0]*pW[0+ldw*0] + pD[ll+jj*sdd+ps*1]*pW[0+ldw*1] + pD[ll+jj*sdd+ps*2]*pW[0+ldw*2] + pD[ll+jj*sdd+ps*3]*pW[0+ldw*3];
+			}
+		}
+
+	return;
+	}
+
+
+
+void kernel_dlarf_t_4_lib4(int m, int n, double *pD, int sdd, double *pVt, double *dD, double *pC0, int sdc, double *pW0)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, ll;
+	const int ps = 4;
+	double v10,
+	       v20, v21,
+		   v30, v31, v32;
+	double c00, c01,
+	       c10, c11,
+	       c20, c21,
+	       c30, c31;
+	double a0, a1, a2, a3, b0, b1;
+	double tmp, d0, d1, d2, d3;
+	double *pC, *pW;
+	double pT[16];// = {};
+	int ldt = 4;
+	// dot product of v
+	v10 = 0.0;
+	v20 = 0.0;
+	v30 = 0.0;
+	v21 = 0.0;
+	v31 = 0.0;
+	v32 = 0.0;
+	if(m>1)
+		{
+		v10 = 1.0 * pD[1+ps*0];
+		if(m>2)
+			{
+			v10 += pD[2+ps*1] * pD[2+ps*0];
+			v20 = 1.0 * pD[2+ps*0];
+			v21 = 1.0 * pD[2+ps*1];
+			if(m>3)
+				{
+				v10 += pD[3+ps*1] * pD[3+ps*0];
+				v20 += pD[3+ps*2] * pD[3+ps*0];
+				v21 += pD[3+ps*2] * pD[3+ps*1];
+				v30 = 1.0 * pD[3+ps*0];
+				v31 = 1.0 * pD[3+ps*1];
+				v32 = 1.0 * pD[3+ps*2];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+		v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+		v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+		v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+		v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+		v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+		v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+		v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+		v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+		v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+		v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+		v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+		v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+		v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+		v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+		v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+		v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+		v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+		v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+		v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+		v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+		v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+		v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+		v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+		v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+		v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+		v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+		v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+		v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+		}
+	// compute lower triangular T containing tau for matrix update
+	pT[0+ldt*0] = dD[0];
+	pT[1+ldt*1] = dD[1];
+	pT[2+ldt*2] = dD[2];
+	pT[3+ldt*3] = dD[3];
+	pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+	pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+	pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+	pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+	pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+	pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+	// downgrade matrix
+	__m256d
+		_w0, _w1, _w2, _w3, _d0, _t0, _tp, _c0, _c1, _c2, _c3, _a0, _b0, _tz;
+
+	ii = 0;
+#if 1
+	double alpha = 1.0;
+	double beta = 0.0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for( ; ii<n-11; ii+=12)
+		{
+		kernel_dgemm_nn_4x12_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+		}
+#endif
+	for( ; ii<n-7; ii+=8)
+		{
+		kernel_dgemm_nn_4x8_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+		}
+	for( ; ii<n-3; ii+=4)
+		{
+		kernel_dgemm_nn_4x4_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+		}
+	if(ii<n)
+		{
+//		kernel_dgemm_nn_4x4_vs_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii], 4, n-ii);
+		kernel_dgemm_nn_4x4_gen_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, 0, &pW0[0+ps*ii], 0, 0, &pW0[0+ps*ii], 0, 0, 4, 0, n-ii);
+		}
+#else
+	for( ; ii<n-3; ii+=4)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		_w0 = _mm256_setzero_pd();
+		_w1 = _mm256_setzero_pd();
+		_w2 = _mm256_setzero_pd();
+		_w3 = _mm256_setzero_pd();
+		for(jj=0; jj<m-3; jj+=4)
+			{
+			//
+			_d0 = _mm256_load_pd( &pVt[0+ps*(0+jj)] );
+			_t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*0] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w0 = _mm256_add_pd( _w0, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*1] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w1 = _mm256_add_pd( _w1, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*2] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w2 = _mm256_add_pd( _w2, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*3] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w3 = _mm256_add_pd( _w3, _tp );
+			//
+			_d0 = _mm256_load_pd( &pVt[0+ps*(1+jj)] );
+			_t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*0] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w0 = _mm256_add_pd( _w0, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*1] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w1 = _mm256_add_pd( _w1, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*2] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w2 = _mm256_add_pd( _w2, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*3] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w3 = _mm256_add_pd( _w3, _tp );
+			//
+			_d0 = _mm256_load_pd( &pVt[0+ps*(2+jj)] );
+			_t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*0] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w0 = _mm256_add_pd( _w0, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*1] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w1 = _mm256_add_pd( _w1, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*2] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w2 = _mm256_add_pd( _w2, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*3] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w3 = _mm256_add_pd( _w3, _tp );
+			//
+			_d0 = _mm256_load_pd( &pVt[0+ps*(3+jj)] );
+			_t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*0] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w0 = _mm256_add_pd( _w0, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*1] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w1 = _mm256_add_pd( _w1, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*2] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w2 = _mm256_add_pd( _w2, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*3] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w3 = _mm256_add_pd( _w3, _tp );
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			_d0 = _mm256_load_pd( &pVt[0+ps*(ll+jj)] );
+			_t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*0] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w0 = _mm256_add_pd( _w0, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*1] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w1 = _mm256_add_pd( _w1, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*2] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w2 = _mm256_add_pd( _w2, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*3] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w3 = _mm256_add_pd( _w3, _tp );
+			}
+		// TODO mask store
+		_mm256_storeu_pd( &pW[0+ps*0], _w0 );
+		_mm256_storeu_pd( &pW[0+ps*1], _w1 );
+		_mm256_storeu_pd( &pW[0+ps*2], _w2 );
+		_mm256_storeu_pd( &pW[0+ps*3], _w3 );
+		}
+	for( ; ii<n; ii++)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ps*0] = tmp;
+		if(m>1)
+			{
+			d0 = pVt[0+ps*1];
+			tmp = pC[1+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] = tmp;
+			if(m>2)
+				{
+				d0 = pVt[0+ps*2];
+				d1 = pVt[1+ps*2];
+				tmp = pC[2+ps*0];
+				pW[0+ps*0] += d0 * tmp;
+				pW[1+ps*0] += d1 * tmp;
+				pW[2+ps*0] = tmp;
+				if(m>3)
+					{
+					d0 = pVt[0+ps*3];
+					d1 = pVt[1+ps*3];
+					d2 = pVt[2+ps*3];
+					tmp = pC[3+ps*0];
+					pW[0+ps*0] += d0 * tmp;
+					pW[1+ps*0] += d1 * tmp;
+					pW[2+ps*0] += d2 * tmp;
+					pW[3+ps*0] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pVt[0+ps*(0+jj)];
+			d1 = pVt[1+ps*(0+jj)];
+			d2 = pVt[2+ps*(0+jj)];
+			d3 = pVt[3+ps*(0+jj)];
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] += d1 * tmp;
+			pW[2+ps*0] += d2 * tmp;
+			pW[3+ps*0] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(1+jj)];
+			d1 = pVt[1+ps*(1+jj)];
+			d2 = pVt[2+ps*(1+jj)];
+			d3 = pVt[3+ps*(1+jj)];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] += d1 * tmp;
+			pW[2+ps*0] += d2 * tmp;
+			pW[3+ps*0] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(2+jj)];
+			d1 = pVt[1+ps*(2+jj)];
+			d2 = pVt[2+ps*(2+jj)];
+			d3 = pVt[3+ps*(2+jj)];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] += d1 * tmp;
+			pW[2+ps*0] += d2 * tmp;
+			pW[3+ps*0] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(3+jj)];
+			d1 = pVt[1+ps*(3+jj)];
+			d2 = pVt[2+ps*(3+jj)];
+			d3 = pVt[3+ps*(3+jj)];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] += d1 * tmp;
+			pW[2+ps*0] += d2 * tmp;
+			pW[3+ps*0] += d3 * tmp;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pVt[0+ps*(ll+jj)];
+			d1 = pVt[1+ps*(ll+jj)];
+			d2 = pVt[2+ps*(ll+jj)];
+			d3 = pVt[3+ps*(ll+jj)];
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] += d1 * tmp;
+			pW[2+ps*0] += d2 * tmp;
+			pW[3+ps*0] += d3 * tmp;
+			}
+		}
+#endif
+
+	ii = 0;
+	for( ; ii<n-3; ii+=4)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+
+		// compute W^T *= T
+		_tz = _mm256_setzero_pd();
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*0] );
+		_tp = _mm256_broadcast_sd( &pW[0+ps*0] );
+		_w0 = _mm256_mul_pd( _t0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[0+ps*1] );
+		_w1 = _mm256_mul_pd( _t0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[0+ps*2] );
+		_w2 = _mm256_mul_pd( _t0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[0+ps*3] );
+		_w3 = _mm256_mul_pd( _t0, _tp );
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+		_t0 = _mm256_load_pd( &pT[0+ldt*1] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+		_w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*1] );
+		_w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*2] );
+		_w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*3] );
+		_w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*2] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+		_w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*1] );
+		_w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*2] );
+		_w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*3] );
+		_w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*3] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+		_w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*1] );
+		_w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*2] );
+		_w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*3] );
+		_w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+#else
+		_t0 = _mm256_load_pd( &pT[0+ldt*1] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*1] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w1 = _mm256_add_pd( _w1, _tp );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*2] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w2 = _mm256_add_pd( _w2, _tp );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*3] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w3 = _mm256_add_pd( _w3, _tp );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*2] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*1] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w1 = _mm256_add_pd( _w1, _tp );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*2] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w2 = _mm256_add_pd( _w2, _tp );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*3] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w3 = _mm256_add_pd( _w3, _tp );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*3] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*1] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w1 = _mm256_add_pd( _w1, _tp );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*2] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w2 = _mm256_add_pd( _w2, _tp );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*3] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w3 = _mm256_add_pd( _w3, _tp );
+#endif
+
+		_mm256_store_pd( &pW[0+ps*0], _w0 );
+		_mm256_store_pd( &pW[0+ps*1], _w1 );
+		_mm256_store_pd( &pW[0+ps*2], _w2 );
+		_mm256_store_pd( &pW[0+ps*3], _w3 );
+		}
+	for( ; ii<n; ii++)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+
+		// compute W^T *= T
+		_tz = _mm256_setzero_pd();
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*0] );
+		_tp = _mm256_broadcast_sd( &pW[0+ps*0] );
+		_w0 = _mm256_mul_pd( _t0, _tp );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*1] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*2] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*3] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+
+		_mm256_store_pd( &pW[0+ps*0], _w0 );
+		}
+
+	ii = 0;
+	for( ; ii<n-3; ii+=4)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		// compute C -= V * W^T
+		jj = 0;
+		// load
+		c00 = pC[0+jj*sdc+ps*0];
+		c10 = pC[1+jj*sdc+ps*0];
+		c20 = pC[2+jj*sdc+ps*0];
+		c30 = pC[3+jj*sdc+ps*0];
+		c01 = pC[0+jj*sdc+ps*1];
+		c11 = pC[1+jj*sdc+ps*1];
+		c21 = pC[2+jj*sdc+ps*1];
+		c31 = pC[3+jj*sdc+ps*1];
+		// rank1
+		a1 = pD[1+jj*sdd+ps*0];
+		a2 = pD[2+jj*sdd+ps*0];
+		a3 = pD[3+jj*sdd+ps*0];
+		b0 = pW[0+ps*0];
+		c00 -= b0;
+		c10 -= a1*b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		b1 = pW[0+ps*1];
+		c01 -= b1;
+		c11 -= a1*b1;
+		c21 -= a2*b1;
+		c31 -= a3*b1;
+		// rank2
+		a2 = pD[2+jj*sdd+ps*1];
+		a3 = pD[3+jj*sdd+ps*1];
+		b0 = pW[1+ps*0];
+		c10 -= b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		b1 = pW[1+ps*1];
+		c11 -= b1;
+		c21 -= a2*b1;
+		c31 -= a3*b1;
+		// rank3
+		a3 = pD[3+jj*sdd+ps*2];
+		b0 = pW[2+ps*0];
+		c20 -= b0;
+		c30 -= a3*b0;
+		b1 = pW[2+ps*1];
+		c21 -= b1;
+		c31 -= a3*b1;
+		// rank4
+		a3 = pD[3+jj*sdd+ps*3];
+		b0 = pW[3+ps*0];
+		c30 -= b0;
+		b1 = pW[3+ps*1];
+		c31 -= b1;
+		// store
+		pC[0+jj*sdc+ps*0] = c00;
+		pC[0+jj*sdc+ps*1] = c01;
+		if(m>1)
+			{
+			pC[1+jj*sdc+ps*0] = c10;
+			pC[1+jj*sdc+ps*1] = c11;
+			if(m>2)
+				{
+				pC[2+jj*sdc+ps*0] = c20;
+				pC[2+jj*sdc+ps*1] = c21;
+				if(m>3)
+					{
+					pC[3+jj*sdc+ps*0] = c30;
+					pC[3+jj*sdc+ps*1] = c31;
+					}
+				}
+			}
+		// load
+		c00 = pC[0+jj*sdc+ps*2];
+		c10 = pC[1+jj*sdc+ps*2];
+		c20 = pC[2+jj*sdc+ps*2];
+		c30 = pC[3+jj*sdc+ps*2];
+		c01 = pC[0+jj*sdc+ps*3];
+		c11 = pC[1+jj*sdc+ps*3];
+		c21 = pC[2+jj*sdc+ps*3];
+		c31 = pC[3+jj*sdc+ps*3];
+		// rank1
+		a1 = pD[1+jj*sdd+ps*0];
+		a2 = pD[2+jj*sdd+ps*0];
+		a3 = pD[3+jj*sdd+ps*0];
+		b0 = pW[0+ps*2];
+		c00 -= b0;
+		c10 -= a1*b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		b1 = pW[0+ps*3];
+		c01 -= b1;
+		c11 -= a1*b1;
+		c21 -= a2*b1;
+		c31 -= a3*b1;
+		// rank2
+		a2 = pD[2+jj*sdd+ps*1];
+		a3 = pD[3+jj*sdd+ps*1];
+		b0 = pW[1+ps*2];
+		c10 -= b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		b1 = pW[1+ps*3];
+		c11 -= b1;
+		c21 -= a2*b1;
+		c31 -= a3*b1;
+		// rank3
+		a3 = pD[3+jj*sdd+ps*2];
+		b0 = pW[2+ps*2];
+		c20 -= b0;
+		c30 -= a3*b0;
+		b1 = pW[2+ps*3];
+		c21 -= b1;
+		c31 -= a3*b1;
+		// rank4
+		a3 = pD[3+jj*sdd+ps*3];
+		b0 = pW[3+ps*2];
+		c30 -= b0;
+		b1 = pW[3+ps*3];
+		c31 -= b1;
+		// store
+		pC[0+jj*sdc+ps*2] = c00;
+		pC[0+jj*sdc+ps*3] = c01;
+		if(m>1)
+			{
+			pC[1+jj*sdc+ps*2] = c10;
+			pC[1+jj*sdc+ps*3] = c11;
+			if(m>2)
+				{
+				pC[2+jj*sdc+ps*2] = c20;
+				pC[2+jj*sdc+ps*3] = c21;
+				if(m>3)
+					{
+					pC[3+jj*sdc+ps*2] = c30;
+					pC[3+jj*sdc+ps*3] = c31;
+					}
+				}
+			}
+		}
+	for( ; ii<n; ii++)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		// compute C -= V * W^T
+		jj = 0;
+		// load
+		c00 = pC[0+jj*sdc+ps*0];
+		c10 = pC[1+jj*sdc+ps*0];
+		c20 = pC[2+jj*sdc+ps*0];
+		c30 = pC[3+jj*sdc+ps*0];
+		// rank1
+		a1 = pD[1+jj*sdd+ps*0];
+		a2 = pD[2+jj*sdd+ps*0];
+		a3 = pD[3+jj*sdd+ps*0];
+		b0 = pW[0+ps*0];
+		c00 -= b0;
+		c10 -= a1*b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		// rank2
+		a2 = pD[2+jj*sdd+ps*1];
+		a3 = pD[3+jj*sdd+ps*1];
+		b0 = pW[1+ps*0];
+		c10 -= b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		// rank3
+		a3 = pD[3+jj*sdd+ps*2];
+		b0 = pW[2+ps*0];
+		c20 -= b0;
+		c30 -= a3*b0;
+		// rank4
+		a3 = pD[3+jj*sdd+ps*3];
+		b0 = pW[3+ps*0];
+		c30 -= b0;
+		// store
+		pC[0+jj*sdc+ps*0] = c00;
+		if(m>1)
+			{
+			pC[1+jj*sdc+ps*0] = c10;
+			if(m>2)
+				{
+				pC[2+jj*sdc+ps*0] = c20;
+				if(m>3)
+					{
+					pC[3+jj*sdc+ps*0] = c30;
+					}
+				}
+			}
+		}
+
+#if 1
+	jj = 4;
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; jj<m-11; jj+=12)
+		{
+		kernel_dger4_sub_12r_lib4(n, &pD[jj*sdd], sdd, &pW0[0], &pC0[jj*sdc], sdc);
+		}
+#endif
+	for(; jj<m-7; jj+=8)
+		{
+		kernel_dger4_sub_8r_lib4(n, &pD[jj*sdd], sdd, &pW0[0], &pC0[jj*sdc], sdc);
+		}
+	for(; jj<m-3; jj+=4)
+		{
+		kernel_dger4_sub_4r_lib4(n, &pD[jj*sdd], &pW0[0], &pC0[jj*sdc]);
+		}
+	if(jj<m)
+		{
+		kernel_dger4_sub_4r_vs_lib4(n, &pD[jj*sdd], &pW0[0], &pC0[jj*sdc], m-jj);
+		}
+#else
+	ii = 0;
+	for( ; ii<n-3; ii+=4)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			// load
+			_c0 = _mm256_load_pd( &pC[0+jj*sdc+ps*0] );
+			_c1 = _mm256_load_pd( &pC[0+jj*sdc+ps*1] );
+			_c2 = _mm256_load_pd( &pC[0+jj*sdc+ps*2] );
+			_c3 = _mm256_load_pd( &pC[0+jj*sdc+ps*3] );
+			//
+			_a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*0] );
+			_b0 = _mm256_broadcast_sd( &pW[0+ps*0] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c0 = _mm256_sub_pd( _c0, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[0+ps*1] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c1 = _mm256_sub_pd( _c1, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[0+ps*2] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c2 = _mm256_sub_pd( _c2, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[0+ps*3] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c3 = _mm256_sub_pd( _c3, _tp );
+			//
+			_a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*1] );
+			_b0 = _mm256_broadcast_sd( &pW[1+ps*0] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c0 = _mm256_sub_pd( _c0, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[1+ps*1] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c1 = _mm256_sub_pd( _c1, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[1+ps*2] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c2 = _mm256_sub_pd( _c2, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[1+ps*3] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c3 = _mm256_sub_pd( _c3, _tp );
+			//
+			_a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*2] );
+			_b0 = _mm256_broadcast_sd( &pW[2+ps*0] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c0 = _mm256_sub_pd( _c0, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[2+ps*1] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c1 = _mm256_sub_pd( _c1, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[2+ps*2] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c2 = _mm256_sub_pd( _c2, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[2+ps*3] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c3 = _mm256_sub_pd( _c3, _tp );
+			//
+			_a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*3] );
+			_b0 = _mm256_broadcast_sd( &pW[3+ps*0] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c0 = _mm256_sub_pd( _c0, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[3+ps*1] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c1 = _mm256_sub_pd( _c1, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[3+ps*2] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c2 = _mm256_sub_pd( _c2, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[3+ps*3] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c3 = _mm256_sub_pd( _c3, _tp );
+			// store
+			_mm256_store_pd( &pC[0+jj*sdc+ps*0], _c0 );
+			_mm256_store_pd( &pC[0+jj*sdc+ps*1], _c1 );
+			_mm256_store_pd( &pC[0+jj*sdc+ps*2], _c2 );
+			_mm256_store_pd( &pC[0+jj*sdc+ps*3], _c3 );
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			// load
+			c00 = pC[ll+jj*sdc+ps*0];
+			c01 = pC[ll+jj*sdc+ps*1];
+			//
+			a0 = pD[ll+jj*sdd+ps*0];
+			b0 = pW[0+ps*0];
+			c00 -= a0*b0;
+			b1 = pW[0+ps*1];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*1];
+			b0 = pW[1+ps*0];
+			c00 -= a0*b0;
+			b1 = pW[1+ps*1];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*2];
+			b0 = pW[2+ps*0];
+			c00 -= a0*b0;
+			b1 = pW[2+ps*1];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*3];
+			b0 = pW[3+ps*0];
+			c00 -= a0*b0;
+			b1 = pW[3+ps*1];
+			c01 -= a0*b1;
+			// store
+			pC[ll+jj*sdc+ps*0] = c00;
+			pC[ll+jj*sdc+ps*1] = c01;
+			// load
+			c00 = pC[ll+jj*sdc+ps*2];
+			c01 = pC[ll+jj*sdc+ps*3];
+			//
+			a0 = pD[ll+jj*sdd+ps*0];
+			b0 = pW[0+ps*2];
+			c00 -= a0*b0;
+			b1 = pW[0+ps*3];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*1];
+			b0 = pW[1+ps*2];
+			c00 -= a0*b0;
+			b1 = pW[1+ps*3];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*2];
+			b0 = pW[2+ps*2];
+			c00 -= a0*b0;
+			b1 = pW[2+ps*3];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*3];
+			b0 = pW[3+ps*2];
+			c00 -= a0*b0;
+			b1 = pW[3+ps*3];
+			c01 -= a0*b1;
+			// store
+			pC[ll+jj*sdc+ps*2] = c00;
+			pC[ll+jj*sdc+ps*3] = c01;
+			}
+		}
+	for( ; ii<n; ii++)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			// load
+			c00 = pC[0+jj*sdc+ps*0];
+			c10 = pC[1+jj*sdc+ps*0];
+			c20 = pC[2+jj*sdc+ps*0];
+			c30 = pC[3+jj*sdc+ps*0];
+			//
+			a0 = pD[0+jj*sdd+ps*0];
+			a1 = pD[1+jj*sdd+ps*0];
+			a2 = pD[2+jj*sdd+ps*0];
+			a3 = pD[3+jj*sdd+ps*0];
+			b0 = pW[0+ps*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			//
+			a0 = pD[0+jj*sdd+ps*1];
+			a1 = pD[1+jj*sdd+ps*1];
+			a2 = pD[2+jj*sdd+ps*1];
+			a3 = pD[3+jj*sdd+ps*1];
+			b0 = pW[1+ps*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			//
+			a0 = pD[0+jj*sdd+ps*2];
+			a1 = pD[1+jj*sdd+ps*2];
+			a2 = pD[2+jj*sdd+ps*2];
+			a3 = pD[3+jj*sdd+ps*2];
+			b0 = pW[2+ps*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			//
+			a0 = pD[0+jj*sdd+ps*3];
+			a1 = pD[1+jj*sdd+ps*3];
+			a2 = pD[2+jj*sdd+ps*3];
+			a3 = pD[3+jj*sdd+ps*3];
+			b0 = pW[3+ps*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			// store
+			pC[0+jj*sdc+ps*0] = c00;
+			pC[1+jj*sdc+ps*0] = c10;
+			pC[2+jj*sdc+ps*0] = c20;
+			pC[3+jj*sdc+ps*0] = c30;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			// load
+			c00 = pC[ll+jj*sdc+ps*0];
+			//
+			a0 = pD[ll+jj*sdd+ps*0];
+			b0 = pW[0+ps*0];
+			c00 -= a0*b0;
+			//
+			a0 = pD[ll+jj*sdd+ps*1];
+			b0 = pW[1+ps*0];
+			c00 -= a0*b0;
+			//
+			a0 = pD[ll+jj*sdd+ps*2];
+			b0 = pW[2+ps*0];
+			c00 -= a0*b0;
+			//
+			a0 = pD[ll+jj*sdd+ps*3];
+			b0 = pW[3+ps*0];
+			c00 -= a0*b0;
+			// store
+			pC[ll+jj*sdc+ps*0] = c00;
+			}
+		}
+#endif
+
+	return;
+	}
+
+
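+
+// For reference, and not part of the original kernel set: a hedged,
+// unoptimized sketch of the three phases the routine above performs
+// (W^T = C^T * V, then W^T *= T, then C -= V * W^T, i.e.
+// C := (I - V*T*V^T) * C for a block of 4 reflectors), written for plain
+// column-major arrays with V stored explicitly. It ignores the panel-major
+// packing and the implicit unit diagonal of V that the kernel exploits.
+// The name ref_apply_block_reflector_4 and the n<=64 workspace bound are
+// illustrative assumptions only.
+static void ref_apply_block_reflector_4(int m, int n, double *V, int ldv, double *T, int ldt, double *C, int ldc)
+	{
+	double W[4*64]; // holds W^T, 4 x n, assuming n<=64 for this sketch
+	double w[4];
+	int ii, jj, kk;
+	// W^T = C^T * V
+	for(ii=0; ii<n; ii++)
+		{
+		for(kk=0; kk<4; kk++)
+			{
+			W[kk+4*ii] = 0.0;
+			for(jj=0; jj<m; jj++)
+				W[kk+4*ii] += C[jj+ldc*ii] * V[jj+ldv*kk];
+			}
+		}
+	// W^T = T * W^T, one 4-vector per column of C
+	for(ii=0; ii<n; ii++)
+		{
+		for(kk=0; kk<4; kk++)
+			{
+			w[kk] = 0.0;
+			for(jj=0; jj<4; jj++)
+				w[kk] += T[kk+ldt*jj] * W[jj+4*ii];
+			}
+		for(kk=0; kk<4; kk++)
+			W[kk+4*ii] = w[kk];
+		}
+	// C -= V * W^T
+	for(ii=0; ii<n; ii++)
+		for(jj=0; jj<m; jj++)
+			for(kk=0; kk<4; kk++)
+				C[jj+ldc*ii] -= V[jj+ldv*kk] * W[kk+4*ii];
+	}
+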
+
+// assume n>=4
+void kernel_dgelqf_4_lib4(int n, double *pD, double *dD)
+	{
+	int ii, jj, ll;
+	double alpha, beta, tmp, w1, w2, w3;
+	const int ps = 4;
+	// first column
+	beta = 0.0;
+	for(ii=1; ii<n; ii++)
+		{
+		tmp = pD[0+ps*ii];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		dD[0] = 0.0;
+		tmp = 0.0;
+		goto col2;
+		}
+	alpha = pD[0+ps*0];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[0] = (beta-alpha) / beta;
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[0+ps*0] = beta;
+	w1 = pD[1+ps*0];
+	w2 = pD[2+ps*0];
+	w3 = pD[3+ps*0];
+	//
+	pD[0+ps*1] *= tmp;
+	w1 += pD[1+ps*1] * pD[0+ps*1];
+	w2 += pD[2+ps*1] * pD[0+ps*1];
+	w3 += pD[3+ps*1] * pD[0+ps*1];
+	//
+	pD[0+ps*2] *= tmp;
+	w1 += pD[1+ps*2] * pD[0+ps*2];
+	w2 += pD[2+ps*2] * pD[0+ps*2];
+	w3 += pD[3+ps*2] * pD[0+ps*2];
+	//
+	pD[0+ps*3] *= tmp;
+	w1 += pD[1+ps*3] * pD[0+ps*3];
+	w2 += pD[2+ps*3] * pD[0+ps*3];
+	w3 += pD[3+ps*3] * pD[0+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[0+ps*ii] *= tmp;
+		w1 += pD[1+ps*ii] * pD[0+ps*ii];
+		w2 += pD[2+ps*ii] * pD[0+ps*ii];
+		w3 += pD[3+ps*ii] * pD[0+ps*ii];
+		}
+	//
+	w1 = - dD[0] * w1;
+	w2 = - dD[0] * w2;
+	w3 = - dD[0] * w3;
+	//
+	pD[1+ps*0] += w1;
+	pD[2+ps*0] += w2;
+	pD[3+ps*0] += w3;
+	//
+	pD[1+ps*1] += w1 * pD[0+ps*1];
+	pD[2+ps*1] += w2 * pD[0+ps*1];
+	pD[3+ps*1] += w3 * pD[0+ps*1];
+	//
+	pD[1+ps*2] += w1 * pD[0+ps*2];
+	pD[2+ps*2] += w2 * pD[0+ps*2];
+	pD[3+ps*2] += w3 * pD[0+ps*2];
+	beta = pD[1+ps*2] * pD[1+ps*2];
+	//
+	pD[1+ps*3] += w1 * pD[0+ps*3];
+	pD[2+ps*3] += w2 * pD[0+ps*3];
+	pD[3+ps*3] += w3 * pD[0+ps*3];
+	beta += pD[1+ps*3] * pD[1+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] += w1 * pD[0+ps*ii];
+		pD[2+ps*ii] += w2 * pD[0+ps*ii];
+		pD[3+ps*ii] += w3 * pD[0+ps*ii];
+		beta += pD[1+ps*ii] * pD[1+ps*ii];
+		}
+	// second column
+col2:
+	if(beta==0.0)
+		{
+		dD[1] = 0.0;
+		tmp = 0.0;
+		goto col3;
+		}
+	alpha = pD[1+ps*1];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[1] = (beta-alpha) / beta;
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[1+ps*1] = beta;
+	w2 = pD[2+ps*1];
+	w3 = pD[3+ps*1];
+	//
+	pD[1+ps*2] *= tmp;
+	w2 += pD[2+ps*2] * pD[1+ps*2];
+	w3 += pD[3+ps*2] * pD[1+ps*2];
+	//
+	pD[1+ps*3] *= tmp;
+	w2 += pD[2+ps*3] * pD[1+ps*3];
+	w3 += pD[3+ps*3] * pD[1+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] *= tmp;
+		w2 += pD[2+ps*ii] * pD[1+ps*ii];
+		w3 += pD[3+ps*ii] * pD[1+ps*ii];
+		}
+	//
+	w2 = - dD[1] * w2;
+	w3 = - dD[1] * w3;
+	//
+	pD[2+ps*1] += w2;
+	pD[3+ps*1] += w3;
+	//
+	pD[2+ps*2] += w2 * pD[1+ps*2];
+	pD[3+ps*2] += w3 * pD[1+ps*2];
+	//
+	pD[2+ps*3] += w2 * pD[1+ps*3];
+	pD[3+ps*3] += w3 * pD[1+ps*3];
+	beta = pD[2+ps*3] * pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] += w2 * pD[1+ps*ii];
+		pD[3+ps*ii] += w3 * pD[1+ps*ii];
+		beta += pD[2+ps*ii] * pD[2+ps*ii];
+		}
+	// third column
+col3:
+	if(beta==0.0)
+		{
+		dD[2] = 0.0;
+		tmp = 0.0;
+		goto col4;
+		}
+	alpha = pD[2+ps*2];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[2] = (beta-alpha) / beta;
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[2+ps*2] = beta;
+	w3 = pD[3+ps*2];
+	//
+	pD[2+ps*3] *= tmp;
+	w3 += pD[3+ps*3] * pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] *= tmp;
+		w3 += pD[3+ps*ii] * pD[2+ps*ii];
+		}
+	//
+	w3 = - dD[2] * w3;
+	//
+	pD[3+ps*2] += w3;
+	//
+	pD[3+ps*3] += w3 * pD[2+ps*3];
+	//
+	beta = 0.0;
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] += w3 * pD[2+ps*ii];
+		beta += pD[3+ps*ii] * pD[3+ps*ii];
+		}
+	// fourth column
+col4:
+	if(beta==0.0)
+		{
+		dD[3] = 0.0;
+		tmp = 0.0;
+		return;
+		}
+	alpha = pD[3+ps*3];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[3] = (beta-alpha) / beta;
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[3+ps*3] = beta;
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] *= tmp;
+		}
+	return;
+	}
+
+
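+
+// For reference: each column block of kernel_dgelqf_4_lib4 above repeats the
+// same reflector generation. This is a hedged scalar sketch of that step,
+// using exactly the formulas in-lined above (LAPACK's dlarfg additionally
+// rescales to avoid overflow, which is not done here). On entry
+// x = [alpha, x_1, ..., x_{n-1}]; on exit x[0] = beta and x[1..n-1] holds the
+// reflector v (with v[0] = 1 implicit); the returned value is tau.
+// ref_larfg is an illustrative name, not a BLASFEO kernel.
+static double ref_larfg(int n, double *x)
+	{
+	double alpha, beta, tau, scale;
+	int ii;
+	beta = 0.0;
+	for(ii=1; ii<n; ii++)
+		beta += x[ii]*x[ii];
+	if(beta==0.0)
+		return 0.0; // x is already of the required form
+	alpha = x[0];
+	beta = sqrt(alpha*alpha + beta);
+	if(alpha>0)
+		beta = -beta; // sign chosen to avoid cancellation in alpha-beta
+	tau = (beta-alpha) / beta;
+	scale = 1.0 / (alpha-beta);
+	x[0] = beta;
+	for(ii=1; ii<n; ii++)
+		x[ii] *= scale;
+	return tau;
+	}
+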
+
+// unblocked algorithm
+void kernel_dgelqf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+	{
+	if(m<=0 || n<=0)
+		return;
+	int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+	const int ps = 4;
+	imax = k; // m<n ? m : n;
+	double alpha, beta, tmp;
+	double w00, w01,
+		   w10, w11,
+		   w20, w21,
+		   w30, w31;
+	__m256d
+		_a0, _b0, _t0, _w0, _w1;
+	double *pC00, *pC10, *pC10a, *pC20, *pC20a, *pC01, *pC11;
+	double pT[4];
+	int ldt = 2;
+	double *pD0 = pD-offD;
+	ii = 0;
+#if 1 // rank 2
+	for(; ii<imax-1; ii+=2)
+		{
+		// first row
+		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+		beta = 0.0;
+		for(jj=1; jj<n-ii; jj++)
+			{
+			tmp = pC00[0+ps*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			pC00[0] = beta;
+			for(jj=1; jj<n-ii; jj++)
+				pC00[0+ps*jj] *= tmp;
+			}
+		pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+		kmax = n-ii;
+		w00 = pC10[0+ps*0]; // pC00[0+ps*0] = 1.0
+		for(kk=1; kk<kmax; kk++)
+			{
+			w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+			}
+		w00 = - w00*dD[ii];
+		pC10[0+ps*0] += w00; // pC00[0+ps*0] = 1.0
+		for(kk=1; kk<kmax; kk++)
+			{
+			pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+			}
+		// second row
+		pC11 = pC10+ps*1;
+		beta = 0.0;
+		for(jj=1; jj<n-(ii+1); jj++)
+			{
+			tmp = pC11[0+ps*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			dD[(ii+1)] = 0.0;
+			}
+		else
+			{
+			alpha = pC11[0+ps*0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[(ii+1)] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			pC11[0+ps*0] = beta;
+			for(jj=1; jj<n-(ii+1); jj++)
+				pC11[0+ps*jj] *= tmp;
+			}
+		// compute T
+		kmax = n-ii;
+		tmp = 1.0*0.0 + pC00[0+ps*1]*1.0;
+		for(kk=2; kk<kmax; kk++)
+			tmp += pC00[0+ps*kk]*pC10[0+ps*kk];
+		pT[0+ldt*0] = - dD[ii+0];
+		pT[0+ldt*1] = + dD[ii+1] * tmp * dD[ii+0];
+		pT[1+ldt*1] = - dD[ii+1];
+		// downgrade
+		kmax = n-ii;
+		jmax = m-ii-2;
+		jmax0 = (ps-((ii+2+offD)&(ps-1)))&(ps-1);
+		jmax0 = jmax<jmax0 ? jmax : jmax0;
+		jj = 0;
+		pC20a = &pD0[((offD+ii+2)&(ps-1))+((offD+ii+2)-((offD+ii+2)&(ps-1)))*sdd+ii*ps];
+		pC20 = pC20a;
+		if(jmax0>0)
+			{
+			for( ; jj<jmax0; jj++)
+				{
+				w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+				w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+				for(kk=2; kk<kmax; kk++)
+					{
+					w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+					w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+					}
+				w01 = w00*pT[0+ldt*1] + w01*pT[1+ldt*1];
+				w00 = w00*pT[0+ldt*0];
+				pC20[0+ps*0] += w00*1.0          + w01*0.0;
+				pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+				for(kk=2; kk<kmax; kk++)
+					{
+					pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+					}
+				pC20 += 1;
+				}
+			pC20 += -ps+ps*sdd;
+			}
+		for( ; jj<jmax-3; jj+=4)
+			{
+			//
+			_w0 = _mm256_load_pd( &pC20[0+ps*0] );
+			_a0 = _mm256_load_pd( &pC20[0+ps*1] );
+			_b0 = _mm256_broadcast_sd( &pC00[0+ps*1] );
+			_t0 = _mm256_mul_pd( _a0, _b0 );
+			_w0 = _mm256_add_pd( _w0, _t0 );
+			_w1 = _mm256_load_pd( &pC20[0+ps*1] );
+			for(kk=2; kk<kmax; kk++)
+				{
+				_a0 = _mm256_load_pd( &pC20[0+ps*kk] );
+				_b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+				_t0 = _mm256_mul_pd( _a0, _b0 );
+				_w0 = _mm256_add_pd( _w0, _t0 );
+				_b0 = _mm256_broadcast_sd( &pC10[0+ps*kk] );
+				_t0 = _mm256_mul_pd( _a0, _b0 );
+				_w1 = _mm256_add_pd( _w1, _t0 );
+				}
+			//
+			_b0 = _mm256_broadcast_sd( &pT[1+ldt*1] );
+			_w1 = _mm256_mul_pd( _w1, _b0 );
+			_b0 = _mm256_broadcast_sd( &pT[0+ldt*1] );
+			_t0 = _mm256_mul_pd( _w0, _b0 );
+			_w1 = _mm256_add_pd( _w1, _t0 );
+			_b0 = _mm256_broadcast_sd( &pT[0+ldt*0] );
+			_w0 = _mm256_mul_pd( _w0, _b0 );
+			//
+			_a0 = _mm256_load_pd( &pC20[0+ps*0] );
+			_a0 = _mm256_add_pd( _a0, _w0 );
+			_mm256_store_pd( &pC20[0+ps*0], _a0 );
+			_a0 = _mm256_load_pd( &pC20[0+ps*1] );
+			_b0 = _mm256_broadcast_sd( &pC00[0+ps*1] );
+			_t0 = _mm256_mul_pd( _w0, _b0 );
+			_a0 = _mm256_add_pd( _a0, _t0 );
+			_a0 = _mm256_add_pd( _a0, _w1 );
+			_mm256_store_pd( &pC20[0+ps*1], _a0 );
+			for(kk=2; kk<kmax; kk++)
+				{
+				_a0 = _mm256_load_pd( &pC20[0+ps*kk] );
+				_b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+				_t0 = _mm256_mul_pd( _w0, _b0 );
+				_a0 = _mm256_add_pd( _a0, _t0 );
+				_b0 = _mm256_broadcast_sd( &pC10[0+ps*kk] );
+				_t0 = _mm256_mul_pd( _w1, _b0 );
+				_a0 = _mm256_add_pd( _a0, _t0 );
+				_mm256_store_pd( &pC20[0+ps*kk], _a0 );
+				}
+			pC20 += ps*sdd;
+			}
+		for(ll=0; ll<jmax-jj; ll++)
+			{
+			w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+			w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+			for(kk=2; kk<kmax; kk++)
+				{
+				w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+				w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+				}
+			w01 = w00*pT[0+ldt*1] + w01*pT[1+ldt*1];
+			w00 = w00*pT[0+ldt*0];
+			pC20[0+ps*0] += w00*1.0          + w01*0.0;
+			pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+			for(kk=2; kk<kmax; kk++)
+				{
+				pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+				}
+			pC20 += 1;
+			}
+		}
+#endif
+	for(; ii<imax; ii++)
+		{
+		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+		beta = 0.0;
+		for(jj=1; jj<n-ii; jj++)
+			{
+			tmp = pC00[0+ps*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			pC00[0] = beta;
+			for(jj=1; jj<n-ii; jj++)
+				pC00[0+ps*jj] *= tmp;
+			}
+		if(ii<n)
+			{
+			// compute T
+			pT[0+ldt*0] = - dD[ii+0];
+			// downgrade
+			kmax = n-ii;
+			jmax = m-ii-1;
+			jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+			jmax0 = jmax<jmax0 ? jmax : jmax0;
+			jj = 0;
+			pC10a = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+			pC10 = pC10a;
+			if(jmax0>0)
+				{
+				for( ; jj<jmax0; jj++)
+					{
+					w00 = pC10[0+ps*0];
+					for(kk=1; kk<kmax; kk++)
+						{
+						w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+						}
+					w00 = w00*pT[0+ldt*0];
+					pC10[0+ps*0] += w00;
+					for(kk=1; kk<kmax; kk++)
+						{
+						pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+						}
+					pC10 += 1;
+					}
+				pC10 += -ps+ps*sdd;
+				}
+			for( ; jj<jmax-3; jj+=4)
+				{
+				//
+				_w0 = _mm256_load_pd( &pC10[0+ps*0] );
+				for(kk=1; kk<kmax; kk++)
+					{
+					_a0 = _mm256_load_pd( &pC10[0+ps*kk] );
+					_b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+					_t0 = _mm256_mul_pd( _a0, _b0 );
+					_w0 = _mm256_add_pd( _w0, _t0 );
+					}
+				//
+				_b0 = _mm256_broadcast_sd( &pT[0+ldt*0] );
+				_w0 = _mm256_mul_pd( _w0, _b0 );
+				//
+				_a0 = _mm256_load_pd( &pC10[0+ps*0] );
+				_a0 = _mm256_add_pd( _a0, _w0 );
+				_mm256_store_pd( &pC10[0+ps*0], _a0 );
+				for(kk=1; kk<kmax; kk++)
+					{
+					_a0 = _mm256_load_pd( &pC10[0+ps*kk] );
+					_b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+					_t0 = _mm256_mul_pd( _w0, _b0 );
+					_a0 = _mm256_add_pd( _a0, _t0 );
+					_mm256_store_pd( &pC10[0+ps*kk], _a0 );
+					}
+				pC10 += ps*sdd;
+				}
+			for(ll=0; ll<jmax-jj; ll++)
+				{
+				w00 = pC10[0+ps*0];
+				for(kk=1; kk<kmax; kk++)
+					{
+					w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+					}
+				w00 = w00*pT[0+ldt*0];
+				pC10[0+ps*0] += w00;
+				for(kk=1; kk<kmax; kk++)
+					{
+					pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+					}
+				pC10 += 1;
+				}
+			}
+		}
+	return;
+	}
+
+
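+
+// For reference: the address expressions used above, of the form
+// pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps], are the
+// panel-major indexing rule: element (i,j) of a matrix stored in ps-wide row
+// panels with panel stride sdd lives at p[i%ps + (i - i%ps)*sdd + j*ps].
+// A hedged helper spelling this out; panel_elem is an illustrative name,
+// not a BLASFEO function.
+static inline double * panel_elem(double *p, int sdd, int i, int j)
+	{
+	const int ps = 4;
+	return p + (i & (ps-1)) + (i - (i & (ps-1)))*sdd + j*ps;
+	}
+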
+
+// assume kmax>=4
+void kernel_dlarft_4_lib4(int kmax, double *pD, double *dD, double *pT)
+	{
+	const int ps = 4;
+	int kk;
+	double v10,
+	       v20, v21,
+		   v30, v31, v32;
+	// 0
+	// 1
+	v10 =  pD[0+ps*1];
+	// 2
+	v10 += pD[1+ps*2]*pD[0+ps*2];
+	v20 =  pD[0+ps*2];
+	v21 =  pD[1+ps*2];
+	// 3
+	v10 += pD[1+ps*3]*pD[0+ps*3];
+	v20 += pD[2+ps*3]*pD[0+ps*3];
+	v21 += pD[2+ps*3]*pD[1+ps*3];
+	v30 =  pD[0+ps*3];
+	v31 =  pD[1+ps*3];
+	v32 =  pD[2+ps*3];
+	//
+	for(kk=4; kk<kmax; kk++)
+		{
+		v10 += pD[1+ps*kk]*pD[0+ps*kk];
+		v20 += pD[2+ps*kk]*pD[0+ps*kk];
+		v30 += pD[3+ps*kk]*pD[0+ps*kk];
+		v21 += pD[2+ps*kk]*pD[1+ps*kk];
+		v31 += pD[3+ps*kk]*pD[1+ps*kk];
+		v32 += pD[3+ps*kk]*pD[2+ps*kk];
+		}
+	pT[0+ps*0] = - dD[0];
+	pT[1+ps*1] = - dD[1];
+	pT[2+ps*2] = - dD[2];
+	pT[3+ps*3] = - dD[3];
+	pT[0+ps*1] = - dD[1] * (v10*pT[0+ps*0]);
+	pT[1+ps*2] = - dD[2] * (v21*pT[1+ps*1]);
+	pT[2+ps*3] = - dD[3] * (v32*pT[2+ps*2]);
+	pT[0+ps*2] = - dD[2] * (v20*pT[0+ps*0] + v21*pT[0+ps*1]);
+	pT[1+ps*3] = - dD[3] * (v31*pT[1+ps*1] + v32*pT[1+ps*2]);
+	pT[0+ps*3] = - dD[3] * (v30*pT[0+ps*0] + v31*pT[0+ps*1] + v32*pT[0+ps*2]);
+	return;
+	}
+
+
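+
+// For reference: the hard-coded updates in kernel_dlarft_4_lib4 above follow
+// the forward recursion for the triangular factor of a block reflector
+// (compare LAPACK dlarft), with T(k,k) = -tau_k and
+// T(0:k-1,k) = -tau_k * T(0:k-1,0:k-1) * w(0:k-1,k). The sketch below spells
+// this out as loops, assuming the dot products w[i][k] = v_i^T v_k (the
+// v10..v32 accumulators above, unit diagonals included) are already known.
+// ref_larft_4 is an illustrative name, not a BLASFEO kernel.
+static void ref_larft_4(double w[4][4], double *tau, double *T, int ldt)
+	{
+	int ii, jj, kk;
+	for(kk=0; kk<4; kk++)
+		{
+		T[kk+ldt*kk] = - tau[kk];
+		for(ii=0; ii<kk; ii++)
+			{
+			T[ii+ldt*kk] = 0.0;
+			for(jj=ii; jj<kk; jj++)
+				T[ii+ldt*kk] -= tau[kk] * T[ii+ldt*jj] * w[jj][kk];
+			}
+		}
+	}
+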
+
+// assume n>=4
+#if ! defined(TARGET_X64_INTEL_HASWELL)
+void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT)
+	{
+	int ii, jj, ll;
+	double alpha, beta, tmp, w0, w1, w2, w3;
+	const int ps = 4;
+	// zero tau matrix
+	for(ii=0; ii<16; ii++)
+		pT[ii] = 0.0;
+	// first column
+	beta = 0.0;
+	for(ii=1; ii<n; ii++)
+		{
+		tmp = pD[0+ps*ii];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		dD[0] = 0.0;
+		tmp = 0.0;
+		goto col2;
+		}
+	alpha = pD[0+ps*0];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[0] = (beta-alpha) / beta;
+	pT[0+ps*0] = - dD[0];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[0+ps*0] = beta;
+	w1 = pD[1+ps*0];
+	w2 = pD[2+ps*0];
+	w3 = pD[3+ps*0];
+	//
+	pD[0+ps*1] *= tmp;
+	w1 += pD[1+ps*1] * pD[0+ps*1];
+	w2 += pD[2+ps*1] * pD[0+ps*1];
+	w3 += pD[3+ps*1] * pD[0+ps*1];
+	//
+	pD[0+ps*2] *= tmp;
+	w1 += pD[1+ps*2] * pD[0+ps*2];
+	w2 += pD[2+ps*2] * pD[0+ps*2];
+	w3 += pD[3+ps*2] * pD[0+ps*2];
+	//
+	pD[0+ps*3] *= tmp;
+	w1 += pD[1+ps*3] * pD[0+ps*3];
+	w2 += pD[2+ps*3] * pD[0+ps*3];
+	w3 += pD[3+ps*3] * pD[0+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[0+ps*ii] *= tmp;
+		w1 += pD[1+ps*ii] * pD[0+ps*ii];
+		w2 += pD[2+ps*ii] * pD[0+ps*ii];
+		w3 += pD[3+ps*ii] * pD[0+ps*ii];
+		}
+	//
+	w1 = - dD[0] * w1;
+	w2 = - dD[0] * w2;
+	w3 = - dD[0] * w3;
+	//
+	pD[1+ps*0] += w1;
+	pD[2+ps*0] += w2;
+	pD[3+ps*0] += w3;
+	//
+	pD[1+ps*1] += w1 * pD[0+ps*1];
+	pD[2+ps*1] += w2 * pD[0+ps*1];
+	pD[3+ps*1] += w3 * pD[0+ps*1];
+	//
+	pD[1+ps*2] += w1 * pD[0+ps*2];
+	pD[2+ps*2] += w2 * pD[0+ps*2];
+	pD[3+ps*2] += w3 * pD[0+ps*2];
+	beta = pD[1+ps*2] * pD[1+ps*2];
+	//
+	pD[1+ps*3] += w1 * pD[0+ps*3];
+	pD[2+ps*3] += w2 * pD[0+ps*3];
+	pD[3+ps*3] += w3 * pD[0+ps*3];
+	beta += pD[1+ps*3] * pD[1+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] += w1 * pD[0+ps*ii];
+		pD[2+ps*ii] += w2 * pD[0+ps*ii];
+		pD[3+ps*ii] += w3 * pD[0+ps*ii];
+		beta += pD[1+ps*ii] * pD[1+ps*ii];
+		}
+	// second column
+col2:
+	if(beta==0.0)
+		{
+		dD[1] = 0.0;
+		tmp = 0.0;
+		goto col3;
+		}
+	alpha = pD[1+ps*1];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[1] = (beta-alpha) / beta;
+	pT[1+ps*1] = - dD[1];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[1+ps*1] = beta;
+	w0 = pD[0+ps*1]; //
+	w2 = pD[2+ps*1];
+	w3 = pD[3+ps*1];
+	//
+	pD[1+ps*2] *= tmp;
+	w0 += pD[0+ps*2] * pD[1+ps*2]; //
+	w2 += pD[2+ps*2] * pD[1+ps*2];
+	w3 += pD[3+ps*2] * pD[1+ps*2];
+	//
+	pD[1+ps*3] *= tmp;
+	w0 += pD[0+ps*3] * pD[1+ps*3]; //
+	w2 += pD[2+ps*3] * pD[1+ps*3];
+	w3 += pD[3+ps*3] * pD[1+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] *= tmp;
+		w0 += pD[0+ps*ii] * pD[1+ps*ii]; //
+		w2 += pD[2+ps*ii] * pD[1+ps*ii];
+		w3 += pD[3+ps*ii] * pD[1+ps*ii];
+		}
+	//
+	pT[0+ps*1] = - dD[1] * (w0*pT[0+ps*0]);
+	w2 = - dD[1] * w2;
+	w3 = - dD[1] * w3;
+	//
+	pD[2+ps*1] += w2;
+	pD[3+ps*1] += w3;
+	//
+	pD[2+ps*2] += w2 * pD[1+ps*2];
+	pD[3+ps*2] += w3 * pD[1+ps*2];
+	//
+	pD[2+ps*3] += w2 * pD[1+ps*3];
+	pD[3+ps*3] += w3 * pD[1+ps*3];
+	beta = pD[2+ps*3] * pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] += w2 * pD[1+ps*ii];
+		pD[3+ps*ii] += w3 * pD[1+ps*ii];
+		beta += pD[2+ps*ii] * pD[2+ps*ii];
+		}
+	// third column
+col3:
+	if(beta==0.0)
+		{
+		dD[2] = 0.0;
+		tmp = 0.0;
+		goto col4;
+		}
+	alpha = pD[2+ps*2];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[2] = (beta-alpha) / beta;
+	pT[2+ps*2] = - dD[2];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[2+ps*2] = beta;
+	w0 = pD[0+ps*2];
+	w1 = pD[1+ps*2];
+	w3 = pD[3+ps*2];
+	//
+	pD[2+ps*3] *= tmp;
+	w0 += pD[0+ps*3] * pD[2+ps*3];
+	w1 += pD[1+ps*3] * pD[2+ps*3];
+	w3 += pD[3+ps*3] * pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] *= tmp;
+		w0 += pD[0+ps*ii] * pD[2+ps*ii];
+		w1 += pD[1+ps*ii] * pD[2+ps*ii];
+		w3 += pD[3+ps*ii] * pD[2+ps*ii];
+		}
+	//
+	pT[1+ps*2] = - dD[2] * (w1*pT[1+ps*1]);
+	pT[0+ps*2] = - dD[2] * (w0*pT[0+ps*0] + w1*pT[0+ps*1]);
+	w3 = - dD[2] * w3;
+	//
+	pD[3+ps*2] += w3;
+	//
+	pD[3+ps*3] += w3 * pD[2+ps*3];
+	//
+	beta = 0.0;
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] += w3 * pD[2+ps*ii];
+		beta += pD[3+ps*ii] * pD[3+ps*ii];
+		}
+	// fourth column
+col4:
+	if(beta==0.0)
+		{
+		dD[3] = 0.0;
+		tmp = 0.0;
+		return;
+		}
+	alpha = pD[3+ps*3];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[3] = (beta-alpha) / beta;
+	pT[3+ps*3] = - dD[3];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[3+ps*3] = beta;
+	w0 =  pD[0+ps*3];
+	w1 =  pD[1+ps*3];
+	w2 =  pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] *= tmp;
+		w0 += pD[0+ps*ii] * pD[3+ps*ii];
+		w1 += pD[1+ps*ii] * pD[3+ps*ii];
+		w2 += pD[2+ps*ii] * pD[3+ps*ii];
+		}
+	//
+	pT[2+ps*3] = - dD[3] * (w2*pT[2+ps*2]);
+	pT[1+ps*3] = - dD[3] * (w1*pT[1+ps*1] + w2*pT[1+ps*2]);
+	pT[0+ps*3] = - dD[3] * (w0*pT[0+ps*0] + w1*pT[0+ps*1] + w2*pT[0+ps*2]);
+	return;
+	}
+#endif
+
+
+
+void kernel_dlarfb4_r_1_lib4(int kmax, double *pV, double *pT, double *pD)
+	{
+	const int ps = 4;
+	double pW[16];
+	int kk;
+	// 0
+	pW[0+ps*0] = pD[0+ps*0];
+	// 1
+	pW[0+ps*0] += pD[0+ps*1]*pV[0+ps*1];
+	pW[0+ps*1] = pD[0+ps*1];
+	// 2
+	pW[0+ps*0] += pD[0+ps*2]*pV[0+ps*2];
+	pW[0+ps*1] += pD[0+ps*2]*pV[1+ps*2];
+	pW[0+ps*2] = pD[0+ps*2];
+	// 3
+	pW[0+ps*0] += pD[0+ps*3]*pV[0+ps*3];
+	pW[0+ps*1] += pD[0+ps*3]*pV[1+ps*3];
+	pW[0+ps*2] += pD[0+ps*3]*pV[2+ps*3];
+	pW[0+ps*3] = pD[0+ps*3];
+	//
+	for(kk=4; kk<kmax; kk++)
+		{
+		pW[0+ps*0] += pD[0+ps*kk]*pV[0+ps*kk];
+		pW[0+ps*1] += pD[0+ps*kk]*pV[1+ps*kk];
+		pW[0+ps*2] += pD[0+ps*kk]*pV[2+ps*kk];
+		pW[0+ps*3] += pD[0+ps*kk]*pV[3+ps*kk];
+		}
+	//
+	pW[0+ps*3] = pW[0+ps*0]*pT[0+ps*3] + pW[0+ps*1]*pT[1+ps*3] + pW[0+ps*2]*pT[2+ps*3] + pW[0+ps*3]*pT[3+ps*3];
+	//
+	pW[0+ps*2] = pW[0+ps*0]*pT[0+ps*2] + pW[0+ps*1]*pT[1+ps*2] + pW[0+ps*2]*pT[2+ps*2];
+	//
+	pW[0+ps*1] = pW[0+ps*0]*pT[0+ps*1] + pW[0+ps*1]*pT[1+ps*1];
+	//
+	pW[0+ps*0] = pW[0+ps*0]*pT[0+ps*0];
+	//
+	pD[0+ps*0] += pW[0+ps*0];
+	//
+	pD[0+ps*1] += pW[0+ps*0]*pV[0+ps*1] + pW[0+ps*1];
+	//
+	pD[0+ps*2] += pW[0+ps*0]*pV[0+ps*2] + pW[0+ps*1]*pV[1+ps*2] + pW[0+ps*2];
+	//
+	pD[0+ps*3] += pW[0+ps*0]*pV[0+ps*3] + pW[0+ps*1]*pV[1+ps*3] + pW[0+ps*2]*pV[2+ps*3] + pW[0+ps*3];
+	for(kk=4; kk<kmax; kk++)
+		{
+		pD[0+ps*kk] += pW[0+ps*0]*pV[0+ps*kk] + pW[0+ps*1]*pV[1+ps*kk] + pW[0+ps*2]*pV[2+ps*kk] + pW[0+ps*3]*pV[3+ps*kk];
+		}
+	return;
+	}
+
+
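+
+// For reference: a hedged sketch of how the kernels in this file could be
+// combined for one 4-row blocked LQ step on a panel-major matrix, assuming
+// m>=4, n>=4 and that the 4-row block starts at a panel boundary. The actual
+// BLASFEO drivers live elsewhere and may organize the trailing update
+// differently; sketch_dgelqf_block4 is an illustrative name, not a BLASFEO
+// routine.
+static void sketch_dgelqf_block4(int m, int n, double *pD, int sdd, double *dD)
+	{
+	const int ps = 4;
+	double pT[16];
+	int ii;
+	for(ii=0; ii<16; ii++)
+		pT[ii] = 0.0;
+	// factor the leading 4 x n panel and build its 4x4 T factor
+	kernel_dgelqf_4_lib4(n, pD, dD);
+	kernel_dlarft_4_lib4(n, pD, dD, pT);
+	// apply the block of 4 reflectors from the right to each remaining row
+	for(ii=4; ii<m; ii++)
+		kernel_dlarfb4_r_1_lib4(n, pD, pT, pD + (ii & (ps-1)) + (ii - (ii & (ps-1)))*sdd);
+	}
+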
+
+
diff --git a/kernel/avx/kernel_dgetrf_pivot_4_lib4.c b/kernel/avx/kernel_dgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..91d1cc0
--- /dev/null
+++ b/kernel/avx/kernel_dgetrf_pivot_4_lib4.c
@@ -0,0 +1,1434 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
+// C numbering (starting from zero) in the ipiv
+void kernel_dgetrf_pivot_4_lib4(int m, double *pA, int sda, double *inv_diag_A, int* ipiv)
+	{
+
+	const int bs = 4;
+
+	// assume m>=4
+	int ma = m-4;
+
+	__m128d
+		max0, max1, msk0, imx0, imx1,
+		inv;
+	
+		
+	__m256d
+		lft, msk,
+		sgn, vna, max, imx, idx,
+		ones,
+		tmp,
+		a_0,
+		b_0, b_1, b_2,
+		scl,
+		c_0,
+		d_0;
+	
+	double
+		dlft;
+
+	sgn = _mm256_set_pd( -0.0, -0.0, -0.0, -0.0 );
+	vna = _mm256_set_pd( 4.0, 4.0, 4.0, 4.0 );
+	lft  = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+	ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+
+	double
+		tmp0;
+	
+	double
+		*pB;
+	
+	int 
+		k, idamax;
+	
+	int B_pref = bs*sda;
+	
+
+	// first column
+
+	// find pivot
+	pB = &pA[0+bs*0];
+	idx = lft; // _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	k = 0;
+	for( ; k<m-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0] );
+//		__builtin_prefetch( pB+2*B_pref );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0] );
+//		__builtin_prefetch( pB+2*B_pref );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for( ; k<m-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0] );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<m)
+		{
+		dlft = m-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		a_0 = _mm256_load_pd( &pB[0] );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_blendv_pd( a_0, sgn, msk );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	ipiv[0] = idamax;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[0]!=0)
+			drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+		inv = _mm_loaddup_pd( &pA[0+bs*0] );
+		inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+		scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+		_mm_store_sd( &inv_diag_A[0], inv );
+		}
+	else
+		{
+		scl = ones;
+		inv_diag_A[0] = 0.0;
+		}
+
+
+	// second column
+
+	// scale & correct & find pivot
+	idx = _mm256_set_pd( 2.2, 1.2, 0.2, -0.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	c_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	a_0 = _mm256_blend_pd( tmp, a_0, 0x1 );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	d_0 = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( d_0, c_0, 0x1 );
+	_mm256_store_pd( &pA[0+bs*0], a_0 );
+	_mm256_store_pd( &pA[0+bs*1], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x1 );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	ipiv[1] = idamax+1;
+	if(tmp0!=0)
+		{
+		if(ipiv[1]!=1)
+			drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+		inv = _mm_loaddup_pd( &pA[1+bs*1] );
+		inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+		scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+		_mm_store_sd( &inv_diag_A[1], inv );
+		}
+	else
+		{
+		scl = ones;
+		inv_diag_A[1] = 0.0;
+		}
+
+
+	// third column
+
+	// scale & correct & find pivot
+	idx = _mm256_set_pd( 1.2, 0.2, -0.8, -1.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	c_0 = _mm256_load_pd( &pA[0+bs*2] );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( tmp, c_0, 0x1 );
+	a_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	a_0 = _mm256_blend_pd( tmp, a_0, 0x3 );
+	b_1 = _mm256_permute_pd( b_1, 0xf );
+	tmp = _mm256_mul_pd( a_0, b_1 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( tmp, c_0, 0x3 );
+	_mm256_store_pd( &pA[0+bs*1], a_0 );
+	_mm256_store_pd( &pA[0+bs*2], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x3 );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	ipiv[2] = idamax+2;
+	if(tmp0!=0)
+		{
+		if(ipiv[2]!=2)
+			drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+		inv = _mm_loaddup_pd( &pA[2+bs*2] );
+		inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+		scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+		_mm_store_sd( &inv_diag_A[2], inv );
+		}
+	else
+		{
+		scl = ones;
+		inv_diag_A[2] = 0.0;
+		}
+
+
+	// fourth column
+
+	// scale & correct & find pivot
+	idx = _mm256_set_pd( 0.2, -0.8, -1.8, -2.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	c_0 = _mm256_load_pd( &pA[0+bs*3] );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( tmp, c_0, 0x1 );
+	b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_1 = _mm256_permute_pd( b_1, 0xf );
+	a_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, b_1 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( tmp, c_0, 0x3 );
+	a_0 = _mm256_load_pd( &pA[0+bs*2] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_2 = _mm256_permute2f128_pd( c_0, c_0, 0x11 );
+	a_0 = _mm256_blend_pd( tmp, a_0, 0x7 );
+	b_2 = _mm256_permute_pd( b_2, 0x0 );
+	tmp = _mm256_mul_pd( a_0, b_2 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( tmp, c_0, 0x7 );
+	_mm256_store_pd( &pA[0+bs*2], a_0 );
+	_mm256_store_pd( &pA[0+bs*3], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x7 );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	ipiv[3] = idamax+3;
+	if(tmp0!=0)
+		{
+		if(ipiv[3]!=3)
+			drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+		inv = _mm_loaddup_pd( &pA[3+bs*3] );
+		inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+		scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+		_mm_store_sd( &inv_diag_A[3], inv );
+		}
+	else
+		{
+		scl = ones;
+		inv_diag_A[3] = 0.0;
+		}
+
+	// scale
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		tmp = _mm256_mul_pd( c_0, scl );
+		c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+//		pB += B_pref;
+		}
+
+	return;
+
+	}
+
+	
+
+void kernel_dgetrf_pivot_4_vs_lib4(int m, int n, double *pA, int sda, double *inv_diag_A, int* ipiv)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	// assume m>=4
+	int ma = m-4;
+
+	__m128d
+		max0, max1, msk0, imx0, imx1,
+		inv;
+	
+		
+	__m256d
+		lft, msk,
+		sgn, vna, max, imx, idx,
+		ones,
+		tmp,
+		a_0,
+		b_0, b_1, b_2,
+		scl,
+		c_0,
+		d_0;
+	
+	double
+		dlft;
+
+	sgn = _mm256_set_pd( -0.0, -0.0, -0.0, -0.0 );
+	vna = _mm256_set_pd( 4.0, 4.0, 4.0, 4.0 );
+	lft  = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+	ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
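+	// sgn  : sign-bit mask, used with andnot to take absolute values
+	// vna  : increment of the running row index (4 rows per panel)
+	// lft  : lane thresholds i+0.2, compared against the number of rows left
+	//        to mask out-of-range lanes of an incomplete panel; also the
+	//        initial row-index vector (cvtsd2si rounds i+0.2 back to i)
+	// ones : numerator used to form the reciprocal of the pivot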
+
+	double
+		tmp0;
+	
+	double
+		*pB;
+	
+	int 
+		k, idamax;
+	
+	int B_pref = bs*sda;
+	
+
+	// first column
+
+	// find pivot
+	pB = &pA[0+bs*0];
+	idx = lft; // _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	k = 0;
+	for( ; k<m-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0] );
+//		__builtin_prefetch( pB+2*B_pref );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0] );
+//		__builtin_prefetch( pB+2*B_pref );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for( ; k<m-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0] );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<m)
+		{
+		dlft = m-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		a_0 = _mm256_load_pd( &pB[0] );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_blendv_pd( a_0, sgn, msk );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	ipiv[0] = idamax;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[0]!=0)
+			drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+		inv = _mm_loaddup_pd( &pA[0+bs*0] );
+		inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+		scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+		_mm_store_sd( &inv_diag_A[0], inv );
+		}
+	else
+		{
+		scl = ones;
+		inv_diag_A[0] = 0.0;
+		}
+	
+	if(n==1)
+		{
+		// scale & return
+		dlft = m;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_load_pd( &pA[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_blend_pd( tmp, a_0, 0x1 );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		_mm256_store_pd( &pA[0+bs*0], a_0 );
+		pB = pA + B_pref;
+		k = 0;
+		for(; k<ma-7; k+=8)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*0] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*0], a_0 );
+			pB += B_pref;
+			a_0 = _mm256_load_pd( &pB[0+bs*0] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*0], a_0 );
+			pB += B_pref;
+			}
+		for(; k<ma-3; k+=4)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*0] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*0], a_0 );
+			pB += B_pref;
+			}
+		if(k<ma)
+			{
+			dlft = ma-k;
+			msk = _mm256_broadcast_sd( &dlft );
+			msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+			a_0 = _mm256_load_pd( &pB[0+bs*0] );
+			tmp = _mm256_mul_pd( a_0, scl );
+			a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+			_mm256_store_pd( &pB[0+bs*0], a_0 );
+	//		pB += B_pref;
+			}
+
+		return;
+		}
+
+
+	// second column
+
+	// scale & correct & find pivot
+	dlft = m;
+	msk = _mm256_broadcast_sd( &dlft );
+	msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+	idx = _mm256_set_pd( 2.2, 1.2, 0.2, -0.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	c_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	tmp = _mm256_blend_pd( tmp, a_0, 0x1 );
+	a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	d_0 = _mm256_sub_pd( c_0, tmp );
+	d_0 = _mm256_blend_pd( d_0, c_0, 0x1 );
+	c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+	_mm256_store_pd( &pA[0+bs*0], a_0 );
+	_mm256_store_pd( &pA[0+bs*1], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x1 );
+	c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	if(m>1)
+		{
+		ipiv[1] = idamax+1;
+		if(tmp0!=0)
+			{
+			if(ipiv[1]!=1)
+				drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+			inv = _mm_loaddup_pd( &pA[1+bs*1] );
+			inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+			scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+			_mm_store_sd( &inv_diag_A[1], inv );
+			}
+		else
+			{
+			scl = ones;
+			inv_diag_A[1] = 0.0;
+			}
+		}
+
+	if(n==2)
+		{
+		// scale & return
+		dlft = m;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_load_pd( &pA[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_blend_pd( tmp, a_0, 0x3 );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		_mm256_store_pd( &pA[0+bs*1], a_0 );
+		pB = pA + B_pref;
+		k = 0;
+		for(; k<ma-7; k+=8)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*1] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*1], a_0 );
+			pB += B_pref;
+			a_0 = _mm256_load_pd( &pB[0+bs*1] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*1], a_0 );
+			pB += B_pref;
+			}
+		for(; k<ma-3; k+=4)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*1] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*1], a_0 );
+			pB += B_pref;
+			}
+		if(k<ma)
+			{
+			dlft = ma-k;
+			msk = _mm256_broadcast_sd( &dlft );
+			msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+			a_0 = _mm256_load_pd( &pB[0+bs*1] );
+			tmp = _mm256_mul_pd( a_0, scl );
+			a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+			_mm256_store_pd( &pB[0+bs*1], a_0 );
+	//		pB += B_pref;
+			}
+
+		return;
+		}
+
+	// third column
+
+	// scale & correct & find pivot
+	dlft = m;
+	msk = _mm256_broadcast_sd( &dlft );
+	msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+	idx = _mm256_set_pd( 1.2, 0.2, -0.8, -1.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	c_0 = _mm256_load_pd( &pA[0+bs*2] );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	tmp = _mm256_blend_pd( tmp, c_0, 0x1 );
+	c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+	a_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	tmp = _mm256_blend_pd( tmp, a_0, 0x3 );
+	a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+	b_1 = _mm256_permute_pd( b_1, 0xf );
+	tmp = _mm256_mul_pd( a_0, b_1 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	tmp = _mm256_blend_pd( tmp, c_0, 0x3 );
+	c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+	_mm256_store_pd( &pA[0+bs*1], a_0 );
+	_mm256_store_pd( &pA[0+bs*2], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x3 );
+	c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	if(m>2)
+		{
+		ipiv[2] = idamax+2;
+		if(tmp0!=0)
+			{
+			if(ipiv[2]!=2)
+				drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+			inv = _mm_loaddup_pd( &pA[2+bs*2] );
+			inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+			scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+			_mm_store_sd( &inv_diag_A[2], inv );
+			}
+		else
+			{
+			scl = ones;
+			inv_diag_A[2] = 0.0;
+			}
+		}
+
+	if(n==3)
+		{
+		// scale & return
+		dlft = m;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_load_pd( &pA[0+bs*2] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_blend_pd( tmp, a_0, 0x7 );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		_mm256_store_pd( &pA[0+bs*2], a_0 );
+		pB = pA + B_pref;
+		k = 0;
+		for(; k<ma-7; k+=8)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*2] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*2], a_0 );
+			pB += B_pref;
+			a_0 = _mm256_load_pd( &pB[0+bs*2] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*2], a_0 );
+			pB += B_pref;
+			}
+		for(; k<ma-3; k+=4)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*2] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*2], a_0 );
+			pB += B_pref;
+			}
+		if(k<ma)
+			{
+			dlft = ma-k;
+			msk = _mm256_broadcast_sd( &dlft );
+			msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+			a_0 = _mm256_load_pd( &pB[0+bs*2] );
+			tmp = _mm256_mul_pd( a_0, scl );
+			a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+			_mm256_store_pd( &pB[0+bs*2], a_0 );
+	//		pB += B_pref;
+			}
+
+		return;
+		}
+
+	// fourth column
+
+	// scale & correct & find pivot
+	dlft = m;
+	msk = _mm256_broadcast_sd( &dlft );
+	msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+	idx = _mm256_set_pd( 0.2, -0.8, -1.8, -2.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	c_0 = _mm256_load_pd( &pA[0+bs*3] );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	tmp = _mm256_blend_pd( tmp, c_0, 0x1 );
+	c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+	b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_1 = _mm256_permute_pd( b_1, 0xf );
+	a_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, b_1 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	tmp = _mm256_blend_pd( tmp, c_0, 0x3 );
+	c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+	a_0 = _mm256_load_pd( &pA[0+bs*2] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_2 = _mm256_permute2f128_pd( c_0, c_0, 0x11 );
+	tmp = _mm256_blend_pd( tmp, a_0, 0x7 );
+	a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+	b_2 = _mm256_permute_pd( b_2, 0x0 );
+	tmp = _mm256_mul_pd( a_0, b_2 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	tmp = _mm256_blend_pd( tmp, c_0, 0x7 );
+	c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+	_mm256_store_pd( &pA[0+bs*2], a_0 );
+	_mm256_store_pd( &pA[0+bs*3], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x7 );
+	c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	if(m>3)
+		{
+		ipiv[3] = idamax+3;
+		if(tmp0!=0)
+			{
+			if(ipiv[3]!=3)
+				drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+			inv = _mm_loaddup_pd( &pA[3+bs*3] );
+			inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+			scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+			_mm_store_sd( &inv_diag_A[3], inv );
+			}
+		else
+			{
+			scl = ones;
+			inv_diag_A[3] = 0.0;
+			}
+		}
+
+	// scale
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		tmp = _mm256_mul_pd( c_0, scl );
+		c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+//		pB += B_pref;
+		}
+
+	return;
+
+	}
+
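+// Usage sketch (illustrative only, not part of BLASFEO; the caller-side
+// buffer names below are assumptions): factorize the leading m x n block
+// (n <= 4) of a panel-major (lib4) submatrix.
+//
+//	double *pA;          // top-left element of the block, panel-major storage
+//	int sda;             // panel stride: pA + 4*sda is the next 4-row panel
+//	double inv_diag[4];  // receives 1.0/pivot per column (0.0 on zero pivot)
+//	int ipiv[4];         // receives the 0-based pivot row chosen for each column
+//
+//	kernel_dgetrf_pivot_4_vs_lib4(m, n, pA, sda, inv_diag, ipiv);
+//	// row j of the block has been swapped with row ipiv[j]; apply the same
+//	// swaps to the columns outside the block before factorizing the next panel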
diff --git a/kernel/avx/kernel_dsymv_6_lib4.S b/kernel/avx/kernel_dsymv_6_lib4.S
new file mode 100644
index 0000000..b55690a
--- /dev/null
+++ b/kernel/avx/kernel_dsymv_6_lib4.S
@@ -0,0 +1,1031 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t
+// r14   <- z_n
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm4  <- [z_t_4a z_t_4b z_t_4c z_t_4d]
+// ymm5  <- [z_t_5a z_t_5b z_t_5c z_t_5d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- x_n_4
+// ymm11 <- x_n_5
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t+k*sizeof(double)
+// r14   <- z_n+k*sizeof(double)
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm4  <- [z_t_4a z_t_4b z_t_4c z_t_4d]
+// ymm5  <- [z_t_5a z_t_5b z_t_5c z_t_5d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- x_n_4
+// ymm11 <- x_n_5
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_NT_6_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_nt_6_lib4, @function
+inner_kernel_dgemv_add_nt_6_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_nt_6_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_nt_6_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_nt_6_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovupd	0(%r13), %ymm12
+	vmovupd	0(%r14), %ymm13
+
+	vmovapd	0(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm14, %ymm6, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	subl	$4, %r10d
+
+	vmovapd	32(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmulpd	%ymm14, %ymm7, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovapd	64(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm14, %ymm8, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+
+	vmovapd	96(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vmulpd	%ymm14, %ymm9, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovapd	128(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	vmulpd	%ymm14, %ymm10, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovapd	160(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	vmulpd	%ymm14, %ymm11, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovupd	%ymm13, 0(%r14) 
+
+	addq	%r12, %r11
+	addq	$32, %r13
+	addq	$32, %r14
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vcvtsi2sd	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vsubpd		%ymm14, %ymm13, %ymm14
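+	// after the subtraction lane i holds i+0.5-k, so the first k lanes are
+	// negative; vmaskmovpd below uses that sign bit to load/store only the
+	// k remaining elements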
+
+	vmaskmovpd	0(%r13), %ymm14, %ymm12
+	vmaskmovpd	0(%r14), %ymm14, %ymm13
+
+	vmovupd	%ymm14, -32(%rsp) // spill mask to stack
+
+//	vmaskmovpd	-32(%rsp), %ymm14
+	vmaskmovpd	0(%r11), %ymm14, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm14, %ymm6, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	32(%r11), %ymm14, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmulpd	%ymm14, %ymm7, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	64(%r11), %ymm14, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm14, %ymm8, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	96(%r11), %ymm14, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vmulpd	%ymm14, %ymm9, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+		
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	128(%r11), %ymm14, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	vmulpd	%ymm14, %ymm10, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	160(%r11), %ymm14, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	vmulpd	%ymm14, %ymm11, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	%ymm13, %ymm14, 0(%r14)
+
+	sall	$3, %r10d
+	addq	%r10, %r11
+	addq	%r10, %r13
+	addq	%r10, %r14
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_nt_6_lib4, .-inner_kernel_dgemv_add_nt_6_lib4
+#endif
+#endif
+
+
+
+
+
+
+#if 0
+
+// TODO
+// common inner routine with file scope
+//
+// input arguments:
+// r10   <- kmax
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t
+// r14   <- z_n
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- kmax-4
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t+k*sizeof(double)
+// r14   <- z_n+k*sizeof(double)
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dsymv_add_nt_4_lib4, @function
+inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dsymv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_lib4:
+#endif
+#endif
+
+	vmovupd		0(%r13), %ymm12
+	vmovupd		0(%r14), %ymm13
+
+	vmovapd		0(%r11), %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm6, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmovapd		32(%r11), %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x3, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm7, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmovapd		64(%r11), %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x3, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x7, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm8, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+
+	vmovapd		96(%r11), %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x7, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+//	vxorpd		%ymm15, %ymm15, %ymm15
+//	vblendpd	$0x0, %ymm14, %ymm15, %ymm14
+//	vmulpd		%ymm14, %ymm9, %ymm15
+//	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmovupd		%ymm13, 0(%r14) 
+
+	addq	%r12, %r11
+	addq	$32, %r13
+	addq	$32, %r14
+	
+	subq	$4, %r10
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dsymv_add_nt_4_lib4, .-inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm4 <- dirty
+// ymm5 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_AB_6_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_ab_6_lib4, @function
+inner_blend_t_scale_ab_6_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_6_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_ab_6_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_6_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd			%ymm1, %ymm0, %ymm0
+	vhaddpd			%ymm3, %ymm2, %ymm2
+	vhaddpd			%ymm5, %ymm4, %ymm4
+//	vhaddpd			%ymm3, %ymm2, %ymm2
+	vperm2f128		$0x2, %ymm0, %ymm2, %ymm1
+	vperm2f128		$0x13, %ymm0, %ymm2, %ymm0
+	vextractf128	$0x1, %ymm4, %xmm5
+	vaddpd			%ymm0, %ymm1, %ymm0
+	vaddpd			%ymm4, %ymm5, %ymm4
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm0
+	vmulpd			%ymm4, %ymm15, %ymm1
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd			0(%r12), %ymm14
+	vmovupd			32(%r12), %ymm13
+	vmulpd			%ymm15, %ymm14, %ymm14
+	vaddpd			%ymm0, %ymm14, %ymm0
+	vmulpd			%ymm15, %ymm13, %ymm13
+	vaddpd			%ymm1, %ymm13, %ymm1
+	
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vblendpd		$0x3, %ymm1, %ymm15, %ymm1
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_ab_6_lib4, .-inner_blend_t_scale_ab_6_lib4
+#endif
+#endif
+
+
+
+
+
+#if 0
+
+//TODO
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_a1_4_lib4, @function
+inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_a1_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm1
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vaddpd	%ymm0, %ymm1, %ymm0
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+
+	// beta
+	vmovupd		0(%r11), %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_a1_4_lib4, .-inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+#endif
+
+
+
+
+// common inner routine with file scope
+//
+// store 
+//
+// input arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+//
+// output arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_6_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_6_lib4, @function
+inner_store_6_lib4:
+#elif defined(OS_MAC)
+_inner_store_6_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_6_lib4; .scl 2; .type 32; .endef
+inner_store_6_lib4:
+#endif
+#endif
+	
+	vmovupd %ymm0, 0(%r10)
+	vmovupd %xmm1, 32(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_6_lib4, .-inner_store_6_lib4
+#endif
+#endif
+
+
+
+
+
+//                             rdi    rsi              rdx              rcx        r8       r9           rsp+8        rsp+16          rsp+24       rsp+32       rsp+40
+// void kernel_dgemv_nt_6_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
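+//
+// computes, in a single sweep over the k x 6 panel A:
+//   z_n <- z_n + alpha_n * A * x_n     (updated in place)
+//   z_t <- beta_t * y_t + alpha_t * A^T * x_t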
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_nt_6_lib4
+	.type kernel_dgemv_nt_6_lib4, @function
+kernel_dgemv_nt_6_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_nt_6_lib4
+_kernel_dgemv_nt_6_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_nt_6_lib4
+	.def kernel_dgemv_nt_6_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_6_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha_n
+	vbroadcastsd 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+
+	vbroadcastsd 0(%r10), %ymm6
+	vmulpd		%ymm15, %ymm6, %ymm6
+	vbroadcastsd 8(%r10), %ymm7
+	vmulpd		%ymm15, %ymm7, %ymm7
+	vbroadcastsd 16(%r10), %ymm8
+	vmulpd		%ymm15, %ymm8, %ymm8
+	vbroadcastsd 24(%r10), %ymm9
+	vmulpd		%ymm15, %ymm9, %ymm9
+	vbroadcastsd 32(%r10), %ymm10
+	vmulpd		%ymm15, %ymm10, %ymm10
+	vbroadcastsd 40(%r10), %ymm11
+	vmulpd		%ymm15, %ymm11, %ymm11
+
+
+	// inner kernel dgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG7, %r13  // x_t
+	movq	ARG10, %r14  // z_n
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_NT_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_nt_6_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_nt_6_lib4
+#endif
+#endif
+
+
+	// inner blend n scale ab
+
+	movq	ARG3, %r10 // alpha_t
+	movq	ARG8, %r11   // beta_t
+	movq	ARG9, %r12   // y_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_6_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_6_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG11, %r10 // z_t 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_6_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_6_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_nt_6_lib4, .-kernel_dgemv_nt_6_lib4
+#endif
+
+
+
+
+
+#if 0
+// TODO
+//                            rdi    rsi            rdx        rcx      r8           r9           rsp+8        rsp+16 
+// void kernel_dsymv_l_4_lib4(int k, double *alpha, double *A, int sda, double *x_n, double *x_t, double *z_n, double *z_t);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsymv_l_4_lib4
+	.type kernel_dsymv_l_4_lib4, @function
+kernel_dsymv_l_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsymv_l_4_lib4
+_kernel_dsymv_l_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsymv_l_4_lib4
+	.def kernel_dsymv_l_4_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha
+	vbroadcastsd 0(%r10), %ymm15
+
+	movq	ARG5, %r10 // x_n
+
+	vbroadcastsd 0(%r10), %ymm6
+	vmulpd		%ymm15, %ymm6, %ymm6
+	vbroadcastsd 8(%r10), %ymm7
+	vmulpd		%ymm15, %ymm7, %ymm7
+	vbroadcastsd 16(%r10), %ymm8
+	vmulpd		%ymm15, %ymm8, %ymm8
+	vbroadcastsd 24(%r10), %ymm9
+	vmulpd		%ymm15, %ymm9, %ymm9
+
+
+	// inner edge dsyrk & kernel dgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG6, %r13  // x_t
+	movq	ARG7, %r14  // z_n
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dsymv_add_nt_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11   // z_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z_t 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsymv_l_4_lib4, .-kernel_dsymv_l_4_lib4
+#endif
+
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
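+
+// each constant above stores four doubles as .long pairs: the low 32 bits
+// first, then the high 32 bits of the IEEE-754 encoding (e.g. the pair
+// 0 / 1071644672 = 0x3FE00000 is the double 0.5)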
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
+
+
+
+
diff --git a/kernel/avx/kernel_sgead_lib8.S b/kernel/avx/kernel_sgead_lib8.S
new file mode 100644
index 0000000..4cafa0a
--- /dev/null
+++ b/kernel/avx/kernel_sgead_lib8.S
@@ -0,0 +1,3096 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13    <- B
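+//
+// performs B <- B + alpha*A on 8-row panels, four 8-float columns per
+// iteration of the main loop and one per iteration of the clean-up loop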
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_0_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_0_lib8, @function
+inner_kernel_sgead_8_0_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_0_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_0_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_0_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%r13), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%r13), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r12
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		64(%r13), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%r13), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%r13), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_0_lib8, .-inner_kernel_sgead_8_0_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_0_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_0_gen_lib8, @function
+inner_kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_0_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_0_gen_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
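+	// ymm15 = LC00 - m1 (broadcast); assuming LC00 holds the ascending per-lane
+	// constants defined elsewhere in this file, the sign bit is set for the first
+	// m1 lanes only, which is what vmaskmovps keys on below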
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovups		0(%r12), %ymm0
+	vmaskmovps	0(%r13), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$4, %r10d
+
+	vmovups		32(%r12), %ymm0
+	vmaskmovps	32(%r13), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r13)
+	addq		$128, %r12
+
+	vmovups		-64(%r12), %ymm0
+	vmaskmovps	64(%r13), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r13)
+	addq		$128, %r13
+
+	vmovups		-32(%r12), %ymm0
+	vmaskmovps	-32(%r13), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovups		0(%r12), %ymm0
+	vmaskmovps	0(%r13), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_0_gen_lib8, .-inner_kernel_sgead_8_0_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
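+//
+// version for column offset 1: each output column takes rows 1..7 from the panel
+// at A and row 0 from the following panel, rotated into place with
+// vblendps/vpermilps/vperm2f128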
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_1_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_1_lib8, @function
+inner_kernel_sgead_8_1_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_1_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_1_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_1_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+#if 1
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		64(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		-32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r14)
+#else
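+	// disabled alternative: build the shifted column with two unaligned loads and
+	// a single vblendps instead of the aligned-load shuffle sequence above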
+	vmovups		4(%r12), %ymm0
+	vmovups		-28(%rax), %ymm1
+	vblendps	$0x80, %ymm1, %ymm0, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$4, %r10d
+
+	vmovups		36(%r12), %ymm0
+	vmovups		4(%rax), %ymm1
+	vblendps	$0x80, %ymm1, %ymm0, %ymm0
+	vmovaps		32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovups		-60(%r12), %ymm0
+	vmovups		-92(%rax), %ymm1
+	vblendps	$0x80, %ymm1, %ymm0, %ymm0
+	vmovaps		64(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r14)
+	addq		$128, %r14
+
+	vmovups		-28(%r12), %ymm0
+	vmovups		-60(%rax), %ymm1
+	vblendps	$0x80, %ymm1, %ymm0, %ymm0
+	vmovaps		-32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r14)
+#endif
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_1_lib8, .-inner_kernel_sgead_8_1_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+// r15d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_1_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_1_gen_lib8, @function
+inner_kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_1_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_1_gen_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	// compute mask for rows
+	vcvtsi2ss	%r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	64(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	-32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_1_gen_lib8, .-inner_kernel_sgead_8_1_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_2_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_2_lib8, @function
+inner_kernel_sgead_8_2_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_2_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_2_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_2_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		64(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		-32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_2_lib8, .-inner_kernel_sgead_8_2_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+// r15d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_2_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_2_gen_lib8, @function
+inner_kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_2_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_2_gen_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	// compute mask for rows
+	vcvtsi2ss	%r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	64(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	-32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_2_gen_lib8, .-inner_kernel_sgead_8_2_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_3_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_3_lib8, @function
+inner_kernel_sgead_8_3_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_3_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_3_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_3_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		64(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		-32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_3_lib8, .-inner_kernel_sgead_8_3_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+// r15d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_3_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_3_gen_lib8, @function
+inner_kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_3_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_3_gen_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	// compute mask for rows
+	vcvtsi2ss	%r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	64(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	-32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_3_gen_lib8, .-inner_kernel_sgead_8_3_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
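+//
+// version for column offset 4: the upper half of each column of A and the lower
+// half of the following panel are glued together with vinsertf128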
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_4_lib8, @function
+inner_kernel_sgead_8_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_4_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		16(%r12), %xmm0
+	vmovaps		0(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		48(%r12), %xmm0
+	vmovaps		32(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r14)
+	addq		$128, %r12
+
+	vmovaps		-48(%r12), %xmm0
+	vmovaps		64(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		64(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r14)
+	addq		$128, %rax
+
+	vmovaps		-16(%r12), %xmm0
+	vmovaps		-32(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		96(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 96(%r14)
+	addq		$128, %r14
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		16(%r12), %xmm0
+	vmovaps		0(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_4_lib8, .-inner_kernel_sgead_8_4_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+// r15d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_4_gen_lib8, @function
+inner_kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_4_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_4_gen_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	// compute mask for rows
+	vcvtsi2ss	%r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		16(%r12), %xmm0
+	vmovaps		0(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		48(%r12), %xmm0
+	vmovaps		32(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r14)
+	addq		$128, %r12
+
+	vmovaps		-48(%r12), %xmm0
+	vmovaps		64(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	64(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r14)
+	addq		$128, %rax
+
+	vmovaps		-16(%r12), %xmm0
+	vmovaps		-32(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	96(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 96(%r14)
+	addq		$128, %r14
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		16(%r12), %xmm0
+	vmovaps		0(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_4_gen_lib8, .-inner_kernel_sgead_8_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
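+//
+// version for column offset 5 (offsets 5, 6 and 7 mirror offsets 1, 2 and 3,
+// taking most rows from the second panel instead of the first)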
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_5_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_5_lib8, @function
+inner_kernel_sgead_8_5_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_5_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_5_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_5_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		64(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		-32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_5_lib8, .-inner_kernel_sgead_8_5_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+// r15d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_5_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_5_gen_lib8, @function
+inner_kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_5_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_5_gen_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	// compute mask for rows
+	vcvtsi2ss	%r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	64(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	-32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_5_gen_lib8, .-inner_kernel_sgead_8_5_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_6_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_6_lib8, @function
+inner_kernel_sgead_8_6_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_6_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_6_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_6_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		64(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		-32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_6_lib8, .-inner_kernel_sgead_8_6_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+// r15d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_6_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_6_gen_lib8, @function
+inner_kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_6_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_6_gen_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	// compute mask for rows
+	vcvtsi2ss	%r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	64(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	-32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_6_gen_lib8, .-inner_kernel_sgead_8_6_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_7_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_7_lib8, @function
+inner_kernel_sgead_8_7_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_7_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_7_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_7_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		64(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		-32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_7_lib8, .-inner_kernel_sgead_8_7_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+// r15d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_7_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_7_gen_lib8, @function
+inner_kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_7_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_7_gen_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	// compute mask for rows
+	vcvtsi2ss	%r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	64(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	-32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_7_gen_lib8, .-inner_kernel_sgead_8_7_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                            1      2             3         4
+// void kernel_sgead_8_0_lib8(int k, float *alpha, float *A, float *B);
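+//
+// roughly equivalent scalar reference (panel-major storage, 8 floats per column):
+//
+//     for(jj=0; jj<k; jj++)
+//         for(ii=0; ii<8; ii++)
+//             B[ii+8*jj] += alpha[0] * A[ii+8*jj];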
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_0_lib8
+	.type kernel_sgead_8_0_lib8, @function
+kernel_sgead_8_0_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_0_lib8
+_kernel_sgead_8_0_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_0_lib8
+	.def kernel_sgead_8_0_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_0_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_0_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_0_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_0_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_0_lib8, .-kernel_sgead_8_0_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4         5
+// void kernel_sgead_8_0_gen_lib8(int k, float *alpha, float *A, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_0_gen_lib8
+	.type kernel_sgead_8_0_gen_lib8, @function
+kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_0_gen_lib8
+_kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_0_gen_lib8
+	.def kernel_sgead_8_0_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_0_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_0_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_0_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_0_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_0_gen_lib8, .-kernel_sgead_8_0_gen_lib8
+#endif
+
+
+
+
+
+//                            1      2             3         4        5
+// void kernel_sgead_8_1_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_1_lib8
+	.type kernel_sgead_8_1_lib8, @function
+kernel_sgead_8_1_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_1_lib8
+_kernel_sgead_8_1_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_1_lib8
+	.def kernel_sgead_8_1_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_1_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
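+	// r13 <- 32*sda = 8*sda*sizeof(float), byte stride between consecutive row panels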
+	movq	ARG5, %r14  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_1_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_1_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_1_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_1_lib8, .-kernel_sgead_8_1_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4        5         6
+// void kernel_sgead_8_1_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_1_gen_lib8
+	.type kernel_sgead_8_1_gen_lib8, @function
+kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_1_gen_lib8
+_kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_1_gen_lib8
+	.def kernel_sgead_8_1_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_1_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+	movq	ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_1_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_1_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_1_gen_lib8, .-kernel_sgead_8_1_gen_lib8
+#endif
+
+
+
+
+
+//                            1      2             3         4        5
+// void kernel_sgead_8_2_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_2_lib8
+	.type kernel_sgead_8_2_lib8, @function
+kernel_sgead_8_2_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_2_lib8
+_kernel_sgead_8_2_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_2_lib8
+	.def kernel_sgead_8_2_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_2_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_2_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_2_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_2_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_2_lib8, .-kernel_sgead_8_2_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4        5         6
+// void kernel_sgead_8_2_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_2_gen_lib8
+	.type kernel_sgead_8_2_gen_lib8, @function
+kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_2_gen_lib8
+_kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_2_gen_lib8
+	.def kernel_sgead_8_2_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_2_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+	movq	ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_2_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_2_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_2_gen_lib8, .-kernel_sgead_8_2_gen_lib8
+#endif
+
+
+
+
+
+//                            1      2             3         4        5
+// void kernel_sgead_8_3_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_3_lib8
+	.type kernel_sgead_8_3_lib8, @function
+kernel_sgead_8_3_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_3_lib8
+_kernel_sgead_8_3_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_3_lib8
+	.def kernel_sgead_8_3_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_3_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_3_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_3_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_3_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_3_lib8, .-kernel_sgead_8_3_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4        5         6
+// void kernel_sgead_8_3_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_3_gen_lib8
+	.type kernel_sgead_8_3_gen_lib8, @function
+kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_3_gen_lib8
+_kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_3_gen_lib8
+	.def kernel_sgead_8_3_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_3_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+	movq	ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_3_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_3_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_3_gen_lib8, .-kernel_sgead_8_3_gen_lib8
+#endif
+
+
+
+
+
+//                            1      2             3         4        5
+// void kernel_sgead_8_4_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_4_lib8
+	.type kernel_sgead_8_4_lib8, @function
+kernel_sgead_8_4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_4_lib8
+_kernel_sgead_8_4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_4_lib8
+	.def kernel_sgead_8_4_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_4_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_4_lib8, .-kernel_sgead_8_4_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4        5         6
+// void kernel_sgead_8_4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_4_gen_lib8
+	.type kernel_sgead_8_4_gen_lib8, @function
+kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_4_gen_lib8
+_kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_4_gen_lib8
+	.def kernel_sgead_8_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+	movq	ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_4_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_4_gen_lib8, .-kernel_sgead_8_4_gen_lib8
+#endif
+
+
+
+
+
+//                            1      2             3         4        5
+// void kernel_sgead_8_5_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_5_lib8
+	.type kernel_sgead_8_5_lib8, @function
+kernel_sgead_8_5_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_5_lib8
+_kernel_sgead_8_5_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_5_lib8
+	.def kernel_sgead_8_5_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_5_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_5_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_5_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_5_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_5_lib8, .-kernel_sgead_8_5_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi           rdx       rcx      r8        r9
+// void kernel_sgead_8_5_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_5_gen_lib8
+	.type kernel_sgead_8_5_gen_lib8, @function
+kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_5_gen_lib8
+_kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_5_gen_lib8
+	.def kernel_sgead_8_5_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_5_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+	movq	ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_5_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_5_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_5_gen_lib8, .-kernel_sgead_8_5_gen_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi           rdx       rcx      r8
+// void kernel_sgead_8_6_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_6_lib8
+	.type kernel_sgead_8_6_lib8, @function
+kernel_sgead_8_6_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_6_lib8
+_kernel_sgead_8_6_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_6_lib8
+	.def kernel_sgead_8_6_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_6_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_6_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_6_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_6_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_6_lib8, .-kernel_sgead_8_6_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi           rdx       rcx      r8        r9
+// void kernel_sgead_8_6_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_6_gen_lib8
+	.type kernel_sgead_8_6_gen_lib8, @function
+kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_6_gen_lib8
+_kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_6_gen_lib8
+	.def kernel_sgead_8_6_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_6_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+	movq	ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_6_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_6_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_6_gen_lib8, .-kernel_sgead_8_6_gen_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi           rdx       rcx      r8
+// void kernel_sgead_8_7_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_7_lib8
+	.type kernel_sgead_8_7_lib8, @function
+kernel_sgead_8_7_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_7_lib8
+_kernel_sgead_8_7_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_7_lib8
+	.def kernel_sgead_8_7_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_7_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_7_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_7_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_7_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_7_lib8, .-kernel_sgead_8_7_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi           rdx       rcx      r8        r9
+// void kernel_sgead_8_7_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_7_gen_lib8
+	.type kernel_sgead_8_7_gen_lib8, @function
+kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_7_gen_lib8
+_kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_7_gen_lib8
+	.def kernel_sgead_8_7_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_7_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+	movq	ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_7_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_7_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_7_gen_lib8, .-kernel_sgead_8_7_gen_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
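+	// note: the .long values in this section are IEEE-754 single-precision bit
+	// patterns, listed from the lowest lane upwards (the comments above read the
+	// vectors from lane 7 down to lane 0); e.g. 1056964608 = 0x3f000000 = 0.5f,
+	// 1065353216 = 0x3f800000 = 1.0f, 3212836864 = 0xbf800000 = -1.0f.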
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgecp_lib8.S b/kernel/avx/kernel_sgecp_lib8.S
new file mode 100644
index 0000000..5cd2c00
--- /dev/null
+++ b/kernel/avx/kernel_sgecp_lib8.S
@@ -0,0 +1,2796 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
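+
+// note: the Windows variant additionally saves/restores rdi, rsi and xmm6-xmm15
+// because the Win64 calling convention treats them as callee-saved, while the
+// System V ABI used on Linux and Mac does not.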
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- B
+
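+// reference semantics (plain-C sketch, not part of the build): copy k columns
+// of one 8-row panel from A to B, i.e.
+//   for(jj=0; jj<k; jj++)
+//     for(ii=0; ii<8; ii++)
+//       B[ii+8*jj] = A[ii+8*jj];
+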
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_0_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_0_lib8, @function
+inner_kernel_sgecp_8_0_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_0_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_0_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_0_lib8:
+#endif
+#endif
+	
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		%ymm0, 0(%r12)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		%ymm0, 32(%r12)
+	addq		$128, %r11
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		%ymm0, 64(%r12)
+	addq		$128, %r12
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		%ymm0, -32(%r12)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		%ymm0, 0(%r12)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %r12
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_0_lib8, .-inner_kernel_sgecp_8_0_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- B
+// r13d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_0_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_0_gen_lib8, @function
+inner_kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_0_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_0_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
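+	// ymm15 = { 0.5-m1, 1.5-m1, ..., 7.5-m1 }: lane i is negative (sign bit set)
+	// exactly when i < m1, so the vmaskmovps stores below touch only the first m1 rows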
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovups		0(%r11), %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r12)
+	subl		$4, %r10d
+
+	vmovups		32(%r11), %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r12)
+	addq		$128, %r11
+
+	vmovups		-64(%r11), %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r12)
+	addq		$128, %r12
+
+	vmovups		-32(%r11), %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r12)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovups		0(%r11), %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r12)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %r12
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_0_gen_lib8, .-inner_kernel_sgecp_8_0_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_1_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_1_lib8, @function
+inner_kernel_sgecp_8_1_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_1_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_1_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_1_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
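+	// two equivalent variants: the enabled one uses aligned loads from both
+	// panels and rebuilds each output column in registers (destination rows 0-6
+	// come from rows 1-7 of the A0 column, destination row 7 from row 0 of the
+	// A1 column); the disabled one obtains the same result with unaligned loads
+	// starting one row into the source column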
+#if 1
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, -32(%r13)
+#else
+	vmovups		4(%r11), %ymm0
+	vmovups		-28(%rax), %ymm1
+	vblendps	$0x80, %ymm1, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovups		36(%r11), %ymm0
+	vmovups		4(%rax), %ymm1
+	vblendps	$0x80, %ymm1, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovups		-60(%r11), %ymm0
+	vmovups		-92(%rax), %ymm1
+	vblendps	$0x80, %ymm1, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %r13
+
+	vmovups		-28(%r11), %ymm0
+	vmovups		-60(%rax), %ymm1
+	vblendps	$0x80, %ymm1, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r13)
+#endif
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_1_lib8, .-inner_kernel_sgecp_8_1_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_1_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_1_gen_lib8, @function
+inner_kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_1_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_1_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_1_gen_lib8, .-inner_kernel_sgecp_8_1_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_2_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_2_lib8, @function
+inner_kernel_sgecp_8_2_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_2_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_2_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_2_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_2_lib8, .-inner_kernel_sgecp_8_2_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_2_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_2_gen_lib8, @function
+inner_kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_2_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_2_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_2_gen_lib8, .-inner_kernel_sgecp_8_2_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_3_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_3_lib8, @function
+inner_kernel_sgecp_8_3_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_3_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_3_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_3_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_3_lib8, .-inner_kernel_sgecp_8_3_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_3_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_3_gen_lib8, @function
+inner_kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_3_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_3_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_3_gen_lib8, .-inner_kernel_sgecp_8_3_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+
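+// the offset-4 case needs no in-lane shuffles: each output column concatenates
+// rows 4-7 of the A0 column (low half) with rows 0-3 of the A1 column (high
+// half) via a single vinsertf128
+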
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_4_lib8, @function
+inner_kernel_sgecp_8_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_4_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		16(%r11), %xmm0
+	vmovaps		0(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		48(%r11), %xmm0
+	vmovaps		32(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r11
+
+	vmovaps		-48(%r11), %xmm0
+	vmovaps		64(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %rax
+
+	vmovaps		-16(%r11), %xmm0
+	vmovaps		-32(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		%ymm0, 96(%r13)
+	addq		$128, %r13
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		16(%r11), %xmm0
+	vmovaps		0(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_4_lib8, .-inner_kernel_sgecp_8_4_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_4_gen_lib8, @function
+inner_kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_4_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		16(%r11), %xmm0
+	vmovaps		0(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		48(%r11), %xmm0
+	vmovaps		32(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r13)
+	addq		$128, %r11
+
+	vmovaps		-48(%r11), %xmm0
+	vmovaps		64(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r13)
+	addq		$128, %rax
+
+	vmovaps		-16(%r11), %xmm0
+	vmovaps		-32(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 96(%r13)
+	addq		$128, %r13
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		16(%r11), %xmm0
+	vmovaps		0(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_4_gen_lib8, .-inner_kernel_sgecp_8_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_5_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_5_lib8, @function
+inner_kernel_sgecp_8_5_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_5_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_5_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_5_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_5_lib8, .-inner_kernel_sgecp_8_5_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_5_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_5_gen_lib8, @function
+inner_kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_5_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_5_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_5_gen_lib8, .-inner_kernel_sgecp_8_5_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_6_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_6_lib8, @function
+inner_kernel_sgecp_8_6_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_6_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_6_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_6_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_6_lib8, .-inner_kernel_sgecp_8_6_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_6_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_6_gen_lib8, @function
+inner_kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_6_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_6_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_6_gen_lib8, .-inner_kernel_sgecp_8_6_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_7_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_7_lib8, @function
+inner_kernel_sgecp_8_7_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_7_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_7_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_7_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_7_lib8, .-inner_kernel_sgecp_8_7_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_7_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_7_gen_lib8, @function
+inner_kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_7_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_7_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_7_gen_lib8, .-inner_kernel_sgecp_8_7_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                              rdi    rsi       rdx
+// void kernel_sgecp_8_0_lib8(int k, float *A, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_0_lib8
+	.type kernel_sgecp_8_0_lib8, @function
+kernel_sgecp_8_0_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_0_lib8
+_kernel_sgecp_8_0_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_0_lib8
+	.def kernel_sgecp_8_0_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_0_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_0_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_0_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_0_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_0_lib8, .-kernel_sgecp_8_0_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi       rdx       rcx
+// void kernel_sgecp_8_0_gen_lib8(int k, float *A, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_0_gen_lib8
+	.type kernel_sgecp_8_0_gen_lib8, @function
+kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_0_gen_lib8
+_kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_0_gen_lib8
+	.def kernel_sgecp_8_0_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_0_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_0_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_0_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_0_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_0_gen_lib8, .-kernel_sgecp_8_0_gen_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi       rdx      rcx
+// void kernel_sgecp_8_1_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_1_lib8
+	.type kernel_sgecp_8_1_lib8, @function
+kernel_sgecp_8_1_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_1_lib8
+_kernel_sgecp_8_1_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_1_lib8
+	.def kernel_sgecp_8_1_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_1_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_1_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_1_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_1_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_1_lib8, .-kernel_sgecp_8_1_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi       rdx      rcx       r8
+// void kernel_sgecp_8_1_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_1_gen_lib8
+	.type kernel_sgecp_8_1_gen_lib8, @function
+kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_1_gen_lib8
+_kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_1_gen_lib8
+	.def kernel_sgecp_8_1_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_1_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_1_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_1_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_1_gen_lib8, .-kernel_sgecp_8_1_gen_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi       rdx      rcx
+// void kernel_sgecp_8_2_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_2_lib8
+	.type kernel_sgecp_8_2_lib8, @function
+kernel_sgecp_8_2_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_2_lib8
+_kernel_sgecp_8_2_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_2_lib8
+	.def kernel_sgecp_8_2_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_2_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_2_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_2_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_2_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_2_lib8, .-kernel_sgecp_8_2_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi       rdx      rcx       r8
+// void kernel_sgecp_8_2_gen_lib8(int k, float *A, int sda, float *B, int m0);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_2_gen_lib8
+	.type kernel_sgecp_8_2_gen_lib8, @function
+kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_2_gen_lib8
+_kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_2_gen_lib8
+	.def kernel_sgecp_8_2_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_2_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_2_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_2_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_2_gen_lib8, .-kernel_sgecp_8_2_gen_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi       rdx      rcx
+// void kernel_sgecp_8_3_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_3_lib8
+	.type kernel_sgecp_8_3_lib8, @function
+kernel_sgecp_8_3_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_3_lib8
+_kernel_sgecp_8_3_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_3_lib8
+	.def kernel_sgecp_8_3_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_3_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_3_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_3_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_3_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_3_lib8, .-kernel_sgecp_8_3_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi       rdx      rcx       r8
+// void kernel_sgecp_8_3_gen_lib8(int k, float *A, int sda, float *B, int m0);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_3_gen_lib8
+	.type kernel_sgecp_8_3_gen_lib8, @function
+kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_3_gen_lib8
+_kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_3_gen_lib8
+	.def kernel_sgecp_8_3_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_3_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_3_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_3_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_3_gen_lib8, .-kernel_sgecp_8_3_gen_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi       rdx      rcx
+// void kernel_sgecp_8_4_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_4_lib8
+	.type kernel_sgecp_8_4_lib8, @function
+kernel_sgecp_8_4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_4_lib8
+_kernel_sgecp_8_4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_4_lib8
+	.def kernel_sgecp_8_4_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_4_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_4_lib8, .-kernel_sgecp_8_4_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi       rdx      rcx       r8
+// void kernel_sgecp_8_4_gen_lib8(int k, float *A, int sda, float *B, int m0);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_4_gen_lib8
+	.type kernel_sgecp_8_4_gen_lib8, @function
+kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_4_gen_lib8
+_kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_4_gen_lib8
+	.def kernel_sgecp_8_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_4_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_4_gen_lib8, .-kernel_sgecp_8_4_gen_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi       rdx      rcx
+// void kernel_sgecp_8_5_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_5_lib8
+	.type kernel_sgecp_8_5_lib8, @function
+kernel_sgecp_8_5_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_5_lib8
+_kernel_sgecp_8_5_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_5_lib8
+	.def kernel_sgecp_8_5_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_5_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_5_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_5_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_5_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_5_lib8, .-kernel_sgecp_8_5_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi       rdx      rcx       r8
+// void kernel_sgecp_8_5_gen_lib8(int k, float *A, int sda, float *B, int m0);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_5_gen_lib8
+	.type kernel_sgecp_8_5_gen_lib8, @function
+kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_5_gen_lib8
+_kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_5_gen_lib8
+	.def kernel_sgecp_8_5_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_5_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_5_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_5_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_5_gen_lib8, .-kernel_sgecp_8_5_gen_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi       rdx      rcx
+// void kernel_sgecp_8_6_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_6_lib8
+	.type kernel_sgecp_8_6_lib8, @function
+kernel_sgecp_8_6_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_6_lib8
+_kernel_sgecp_8_6_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_6_lib8
+	.def kernel_sgecp_8_6_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_6_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_6_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_6_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_6_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_6_lib8, .-kernel_sgecp_8_6_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi       rdx      rcx       r8
+// void kernel_sgecp_8_6_gen_lib8(int k, float *A, int sda, float *B, int m0);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_6_gen_lib8
+	.type kernel_sgecp_8_6_gen_lib8, @function
+kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_6_gen_lib8
+_kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_6_gen_lib8
+	.def kernel_sgecp_8_6_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_6_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_6_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_6_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_6_gen_lib8, .-kernel_sgecp_8_6_gen_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi       rdx      rcx
+// void kernel_sgecp_8_7_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_7_lib8
+	.type kernel_sgecp_8_7_lib8, @function
+kernel_sgecp_8_7_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_7_lib8
+_kernel_sgecp_8_7_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_7_lib8
+	.def kernel_sgecp_8_7_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_7_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_7_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_7_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_7_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_7_lib8, .-kernel_sgecp_8_7_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi       rdx      rcx       r8
+// void kernel_sgecp_8_7_gen_lib8(int k, float *A, int sda, float *B, int m0);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_7_gen_lib8
+	.type kernel_sgecp_8_7_gen_lib8, @function
+kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_7_gen_lib8
+_kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_7_gen_lib8
+	.def kernel_sgecp_8_7_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_7_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_7_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_7_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_7_gen_lib8, .-kernel_sgecp_8_7_gen_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_16x4_lib8.S b/kernel/avx/kernel_sgemm_16x4_lib8.S
new file mode 100644
index 0000000..5c2d6c4
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_16x4_lib8.S
@@ -0,0 +1,7057 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
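+
+// The ARG*/PROLOGUE/EPILOGUE macros above encode the two supported calling conventions.
+// On System V (Linux/Mac) the first six integer/pointer arguments arrive in rdi, rsi, rdx,
+// rcx, r8, r9 and the 7th sits at 8(%rsp) on entry (just above the return address), hence
+// STACKSIZE+8(%rsp) once the frame has been allocated. On Windows x64 the first four arrive
+// in rcx, rdx, r8, r9, the caller reserves a 32-byte shadow space (so the 5th argument is at
+// STACKSIZE+40(%rsp)), and the low 128 bits of xmm6-xmm15 are callee-saved, which is why the
+// Windows prologue also spills them. As an illustration with a hypothetical prototype (not
+// one of the kernels in this file):
+//
+//	void f(int a1, int a2, int a3, int a4, int a5, int a6, int a7);
+//
+// ARG7 resolves to a stack slot on both ABIs, while ARG5 and ARG6 are registers only on
+// System V.
+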
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
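+
+// Build note: the inner routines below are bracketed by MACRO_LEVEL guards. When the file is
+// assembled with MACRO_LEVEL>=1 (edge and blend helpers) or MACRO_LEVEL>=2 (innermost loops),
+// a routine is emitted as a .macro and expanded inline at its use sites; otherwise it is
+// emitted once as a subroutine and reached through call/callq, which is the
+// "#if MACRO_LEVEL>=... / #else call ..." pattern visible in every kernel_* wrapper.
+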
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nt_16x4_lib8, @function
+inner_kernel_gemm_add_nt_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nt_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_16x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 4*sda*sizeof(float)
+
+	// preload
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vmovaps			0(%r11), %ymm8 // A0
+	vmovaps			0(%r15), %ymm9 // A1
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+//  8 A0
+//  9 A1
+// 10 A0+
+// 11 A1+
+// 12 B
+// 13 B+
+// 14 Bt
+// 15 tmp
+	
+	// unroll 0
+	vmulps			%ymm8, %ymm14, %ymm15
+	vbroadcastf128	32(%r13), %ymm13 // B
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	subl	$4, %r10d
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			32(%r11), %ymm10 // A0
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			32(%r15), %ymm11 // A1
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm8, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 1
+	vmulps			%ymm10, %ymm14, %ymm15
+	vbroadcastf128	64(%r13), %ymm12 // B
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			64(%r11), %ymm8 // A0
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			64(%r15), %ymm9 // A1
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 2
+	vmulps			%ymm8, %ymm14, %ymm15
+	vbroadcastf128	96(%r13), %ymm13 // B
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	addq	$128, %r13
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			96(%r11), %ymm10 // A0
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	addq	$128, %r11
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			96(%r15), %ymm11 // A1
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	addq	$128, %r15
+	vmulps			%ymm8, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 3
+	vmulps			%ymm10, %ymm14, %ymm15
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			0(%r11), %ymm8 // A0
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			0(%r15), %ymm9 // A1
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vmulps			%ymm8, %ymm14, %ymm15
+	vbroadcastf128	32(%r13), %ymm13 // B
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	subl	$4, %r10d
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			32(%r11), %ymm10 // A0
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			32(%r15), %ymm11 // A1
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm8, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 1
+	vmulps			%ymm10, %ymm14, %ymm15
+	vbroadcastf128	64(%r13), %ymm12 // B
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			64(%r11), %ymm8 // A0
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			64(%r15), %ymm9 // A1
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 2
+	vmulps			%ymm8, %ymm14, %ymm15
+	vbroadcastf128	96(%r13), %ymm13 // B
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	addq	$128, %r13
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			96(%r11), %ymm10 // A0
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	addq	$128, %r11
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			96(%r15), %ymm11 // A1
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	addq	$128, %r15
+	vmulps			%ymm8, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 3
+	vmulps			%ymm10, %ymm14, %ymm15
+//	vbroadcastf128	0(%r13), %ymm12 // B
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	vmulps			%ymm10, %ymm14, %ymm15
+//	vmovaps			0(%r11), %ymm8 // A0
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm10, %ymm14, %ymm15
+//	vmovaps			0(%r15), %ymm9 // A1
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm14, %ymm15
+//	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+//	cmpl	$4, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vmovaps			0(%r11), %ymm8 // A0
+	vmovaps			0(%r15), %ymm9 // A1
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vmulps			%ymm8, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vmulps			%ymm8, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vmulps			%ymm8, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$32, %r13
+	addq	$32, %r15
+
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vmulps			%ymm8, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nt_16x4_lib8, .-inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
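+
+// Reference sketch (hedged): the accumulation that inner_kernel_gemm_add_nt_16x4_lib8 is
+// assumed to perform, written in plain C. ymm0-ymm3 hold the four columns of the upper 8x4
+// accumulator block and ymm4-ymm7 the lower one; A consists of two 8-row panels (the second
+// at A + 8*sda floats, matching the byte stride passed in r12) and B is an 8-row panel of
+// which the first four rows are used. In the assembly, each vbroadcastf128 plus the
+// vshufps $0x00/$0x55/$0xaa/$0xff sequence broadcasts B[0..3] of the current k step across
+// all eight lanes of ymm14, one element at a time. Array names and shapes below are
+// illustrative only.
+//
+//	static void gemm_add_nt_16x4_ref(int kmax, const float *A0, int sda, const float *B,
+//			float C0[8][4], float C1[8][4])
+//		{
+//		const float *A1 = A0 + 8*sda; // second 8-row panel of A
+//		int i, j, kk;
+//		for(kk=0; kk<kmax; kk++)
+//			{
+//			for(j=0; j<4; j++)
+//				{
+//				float b = B[j + 8*kk]; // B stored transposed, panel-major
+//				for(i=0; i<8; i++)
+//					{
+//					C0[i][j] += A0[i + 8*kk] * b;
+//					C1[i][j] += A1[i + 8*kk] * b;
+//					}
+//				}
+//			}
+//		}
+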
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_sub_nt_16x4_lib8, @function
+inner_kernel_gemm_sub_nt_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_sub_nt_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_16x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 4*sda*sizeof(float)
+
+	// preload
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vmovaps			0(%r11), %ymm8 // A0
+	vmovaps			0(%r15), %ymm9 // A1
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+//  8 A0
+//  9 A1
+// 10 A0+
+// 11 A1+
+// 12 B
+// 13 B+
+// 14 Bt
+// 15 tmp
+	
+	// unroll 0
+	vmulps			%ymm8, %ymm14, %ymm15
+	vbroadcastf128	32(%r13), %ymm13 // B
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	subl	$4, %r10d
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			32(%r11), %ymm10 // A0
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			32(%r15), %ymm11 // A1
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm8, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 1
+	vmulps			%ymm10, %ymm14, %ymm15
+	vbroadcastf128	64(%r13), %ymm12 // B
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			64(%r11), %ymm8 // A0
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			64(%r15), %ymm9 // A1
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 2
+	vmulps			%ymm8, %ymm14, %ymm15
+	vbroadcastf128	96(%r13), %ymm13 // B
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	addq	$128, %r13
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			96(%r11), %ymm10 // A0
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	addq	$128, %r11
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			96(%r15), %ymm11 // A1
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	addq	$128, %r15
+	vmulps			%ymm8, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 3
+	vmulps			%ymm10, %ymm14, %ymm15
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			0(%r11), %ymm8 // A0
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			0(%r15), %ymm9 // A1
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm7, %ymm7
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vmulps			%ymm8, %ymm14, %ymm15
+	vbroadcastf128	32(%r13), %ymm13 // B
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	subl	$4, %r10d
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			32(%r11), %ymm10 // A0
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			32(%r15), %ymm11 // A1
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm8, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 1
+	vmulps			%ymm10, %ymm14, %ymm15
+	vbroadcastf128	64(%r13), %ymm12 // B
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			64(%r11), %ymm8 // A0
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			64(%r15), %ymm9 // A1
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 2
+	vmulps			%ymm8, %ymm14, %ymm15
+	vbroadcastf128	96(%r13), %ymm13 // B
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	addq	$128, %r13
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			96(%r11), %ymm10 // A0
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	addq	$128, %r11
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			96(%r15), %ymm11 // A1
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	addq	$128, %r15
+	vmulps			%ymm8, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 3
+	vmulps			%ymm10, %ymm14, %ymm15
+//	vbroadcastf128	0(%r13), %ymm12 // B
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	vmulps			%ymm10, %ymm14, %ymm15
+//	vmovaps			0(%r11), %ymm8 // A0
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm10, %ymm14, %ymm15
+//	vmovaps			0(%r15), %ymm9 // A1
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm14, %ymm15
+//	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm7, %ymm7
+
+
+//	cmpl	$4, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vmovaps			0(%r11), %ymm8 // A0
+	vmovaps			0(%r15), %ymm9 // A1
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vmulps			%ymm8, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vmulps			%ymm8, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vmulps			%ymm8, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$32, %r13
+	addq	$32, %r15
+
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vmulps			%ymm8, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm7, %ymm7
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_sub_nt_16x4_lib8, .-inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
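+
+// inner_kernel_gemm_sub_nt_16x4_lib8 above is identical to the _add_ variant except that
+// every vaddps on the accumulators is replaced by vsubps, so it accumulates C -= A * B^T
+// instead of C += A * B^T, with the same register pipeline and loop structure.
+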
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B
+// r14   <- 8*sdb*sizeof(float)
+// r15   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r14   <- 8*sdb*sizeof(float)
+// r15   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nn_16x4_lib8, @function
+inner_kernel_gemm_add_nn_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nn_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_16x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovaps 		0(%r11), %ymm13 // A
+	vmovaps 		0(%r11, %r12, 1), %ymm14 // A
+
+	cmpl	$8, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	0(%r13, %r14, 1) // software prefetch
+	prefetcht0	64(%r13, %r14, 1) // software prefetch
+
+	// unroll 0
+	vbroadcastss	0(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			32(%r11), %ymm10 // A
+	vbroadcastss	32(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	64(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	96(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+	subl	$8, %r10d
+
+	// unroll 1
+	vbroadcastss	4(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastss	36(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	68(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	100(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 2
+	vbroadcastss	8(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			96(%r11), %ymm10 // A
+	vbroadcastss	40(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	72(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	104(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 3
+	vbroadcastss	12(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			128(%r11), %ymm13 // A
+	vbroadcastss	44(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			128(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	76(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	108(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 4
+	vbroadcastss	16(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			160(%r11), %ymm10 // A
+	vbroadcastss	48(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			160(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	80(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	112(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 5
+	vbroadcastss	20(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			192(%r11), %ymm13 // A
+	vbroadcastss	52(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			192(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	84(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	116(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 6
+	vbroadcastss	24(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			224(%r11), %ymm10 // A
+	vbroadcastss	56(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			224(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	88(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	120(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+	addq	$256, %r11
+
+	// unroll 7
+	vbroadcastss	28(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastss	60(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	92(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	124(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+	addq	%r14, %r13
+
+	cmpl	$8, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean8-up
+	
+	cmpl	$7, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vbroadcastss	0(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			32(%r11), %ymm10 // A
+	vbroadcastss	32(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	64(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	96(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+	subl	$8, %r10d
+
+	// unroll 1
+	vbroadcastss	4(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastss	36(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	68(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	100(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 2
+	vbroadcastss	8(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			96(%r11), %ymm10 // A
+	vbroadcastss	40(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	72(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	104(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 3
+	vbroadcastss	12(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			128(%r11), %ymm13 // A
+	vbroadcastss	44(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			128(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	76(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	108(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 4
+	vbroadcastss	16(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			160(%r11), %ymm10 // A
+	vbroadcastss	48(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			160(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	80(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	112(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 5
+	vbroadcastss	20(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			192(%r11), %ymm13 // A
+	vbroadcastss	52(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			192(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	84(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	116(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 6
+	vbroadcastss	24(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			224(%r11), %ymm10 // A
+	vbroadcastss	56(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			224(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	88(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	120(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+	addq	$256, %r11
+
+	// unroll 7
+	vbroadcastss	28(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastss	60(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+//	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	92(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	124(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+	addq	%r14, %r13
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A0
+	vmovaps			0(%r11, %r12, 1), %ymm13 // A1
+	vbroadcastss	0(%r13), %ymm14 // B[0]
+	vmulps			%ymm12, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm13, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	32(%r13), %ymm14 // B[1]
+	vmulps			%ymm12, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm13, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	64(%r13), %ymm14 // B[2]
+	vmulps			%ymm12, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm13, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	96(%r13), %ymm14 // B[3]
+	vmulps			%ymm12, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm13, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$4, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nn_16x4_lib8, .-inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
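+
+// Reference sketch (hedged): the accumulation that inner_kernel_gemm_add_nn_16x4_lib8 is
+// assumed to perform. Here B is not transposed: element (kk, j) of B lives at float offset
+// (kk%8) + 8*j inside the panel starting at B + (kk/8)*8*sdb, which is why the assembly
+// steps r13 by 4 bytes per k iteration in the clean-up loop and by the full panel byte
+// stride in r14 after eight k iterations in the main loop. Names and shapes below are
+// illustrative only.
+//
+//	static void gemm_add_nn_16x4_ref(int kmax, const float *A0, int sda,
+//			const float *B, int sdb, float C0[8][4], float C1[8][4])
+//		{
+//		const float *A1 = A0 + 8*sda; // second 8-row panel of A
+//		int i, j, kk;
+//		for(kk=0; kk<kmax; kk++)
+//			{
+//			const float *Bp = B + (kk/8)*8*sdb + kk%8; // row kk of B, panel-major
+//			for(j=0; j<4; j++)
+//				{
+//				float b = Bp[8*j]; // B[kk][j]
+//				for(i=0; i<8; i++)
+//					{
+//					C0[i][j] += A0[i + 8*kk] * b;
+//					C1[i][j] += A1[i + 8*kk] * b;
+//					}
+//				}
+//			}
+//		}
+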
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(float)
+// r15   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(8-offB)
+// r11   <- A+(8-offB)*bs*sizeof(float)
+// r12   <- bs*sda*sizeof(float)
+// r13   <- B-offB+bs*sdb*sizeof(float)
+// r14   <- bs*sdb*sizeof(float)
+// r15   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_gemm_add_nn_16x4_lib8, @function
+inner_edge_gemm_add_nn_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_gemm_add_nn_16x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_16x4_lib8:
+#endif
+#endif
+	
+	cmpl			$0, %r15d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$8, %ebx
+	subl			%r15d, %ebx // 8-offsetB
+	cmpl			%r10d, %ebx
+//	jle				0f
+//	movl			%r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+	cmovgl			%r10d, %ebx // kend=min(k,8-offsetB)
+
+	movl			%r15d, %eax
+	sall			$2, %eax // offsetB*sizeof(float)
+	addq			%rax, %r13 // B+offsetB*sizeof(float)
+
+1:
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A0
+	vmovaps			0(%r11, %r12, 1), %ymm13 // A1
+	vbroadcastss	0(%r13), %ymm15 // B[0]
+	vmulps			%ymm12, %ymm15, %ymm14
+	vaddps			%ymm14, %ymm0, %ymm0
+	vmulps			%ymm13, %ymm15, %ymm14
+	vaddps			%ymm14, %ymm4, %ymm4
+	vbroadcastss	32(%r13), %ymm15 // B[1]
+	vmulps			%ymm12, %ymm15, %ymm14
+	vaddps			%ymm14, %ymm1, %ymm1
+	vmulps			%ymm13, %ymm15, %ymm14
+	vaddps			%ymm14, %ymm5, %ymm5
+	vbroadcastss	64(%r13), %ymm15 // B[2]
+	vmulps			%ymm12, %ymm15, %ymm14
+	vaddps			%ymm14, %ymm2, %ymm2
+	vmulps			%ymm13, %ymm15, %ymm14
+	vaddps			%ymm14, %ymm6, %ymm6
+	vbroadcastss	96(%r13), %ymm15 // B[3]
+	vmulps			%ymm12, %ymm15, %ymm14
+	vaddps			%ymm14, %ymm3, %ymm3
+	vmulps			%ymm13, %ymm15, %ymm14
+	vaddps			%ymm14, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	subl			$1, %ebx // end-1
+	addq			$32, %r11 // A+1*bs*sizeof(float)
+	addq			$4, %r13 // B+1*sizeof(float)
+
+	cmpl			$0, %ebx
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r14, %r13
+	subq			$32, %r13 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_gemm_add_nn_16x4_lib8, .-inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(float)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(float)
+// r12   <- bs*sda*sizeof(float)
+// r13   <- B-offB+bs*sdb*sizeof(float)
+// r14   <- bs*sdb*sizeof(float)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
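+//
+// rough scalar reference (pseudocode only, packed layout abstracted away):
+// the edge multiplies by the lower-triangular top of B, one k-row at a time;
+// the row that touches all 4 columns is left to the regular nn kernel, and
+// the offB==5/6/7 branches additionally step into the next B panel
+// mid-triangle:
+//
+//   for(kk=0; kk<3 && k>0; kk++, k--)
+//     for(jj=0; jj<=kk; jj++)
+//       for(ii=0; ii<16; ii++)
+//         acc[ii][jj] += A[ii][kk] * B[offB+kk][jj];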
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trmm_nn_rl_16x4_lib8, @function
+inner_edge_trmm_nn_rl_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trmm_nn_rl_16x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_16x4_lib8:
+#endif
+#endif
+	
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	movl		%r15d, %eax
+	sall		$2, %eax // offsetB*sizeof(float)
+	movq		%r13, %rbx // B
+	addq		%rax, %rbx // B+offsetB*sizeof(float)
+
+
+	cmpl	$4, %r15d
+	jg		1f
+
+	// offB==0, 1, 2, 3, 4
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	8(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	40(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	72(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	jmp			0f // end
+
+
+1:
+	cmpl	$5, %r15d
+	jg		1f
+
+	// offB==5
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	8(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	40(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	72(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r14, %r13 // B+8*sdb*sizeof(float)
+	movl		$0, %r15d // offsetB=0
+
+	jmp			0f // end
+
+
+1:
+	cmpl	$6, %r15d
+	jg		1f
+
+	// offB==6
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r14, %r13 // B+8*sdb*sizeof(float)
+	movq		%r13, %rbx // B
+	movl		$0, %r15d // offsetB=0
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	32(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	64(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	jmp			0f // end
+
+
+1:
+//	cmpl	$7, %r15d
+//	jg		0f
+
+	// offB==7
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r14, %r13 // B+8*sdb*sizeof(float)
+	movq		%r13, %rbx // B
+	movl		$0, %r15d // offsetB=0
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	32(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	68(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+//	jmp			0f // end
+
+
+	// end
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trmm_nn_rl_16x4_lib8, .-inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
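+//
+// rough scalar reference (pseudocode only): right triangular solve of the
+// 16x4 accumulator against the transposed lower factor held at D (r10), with
+// pre-inverted diagonal entries in inv_diag_D (r11) and kn (r12d) limiting
+// the number of active columns:
+//
+//   for(jj=0; jj<4; jj++) {
+//     for(ii=0; ii<16; ii++) acc[ii][jj] *= inv_diag_D[jj];
+//     if(jj+1>=kn) break;
+//     for(ll=jj+1; ll<4; ll++)
+//       for(ii=0; ii<16; ii++) acc[ii][ll] -= D[ll][jj] * acc[ii][jj];
+//   }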
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsm_rlt_inv_16x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsm_rlt_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#endif
+#endif
+
+	vbroadcastss	0(%r11), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vmulps			%ymm4, %ymm13, %ymm4
+	cmpl			$2, %r12d
+	jl				0f // ret
+	vbroadcastss	4(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm1, %ymm1
+	vmulps			%ymm4, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm5, %ymm5
+	vbroadcastss	8(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vmulps			%ymm4, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm6, %ymm6
+	vbroadcastss	12(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+	vmulps			%ymm4, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	4(%r11), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vmulps			%ymm5, %ymm13, %ymm5
+	cmpl			$3, %r12d
+	jl				0f // ret
+	vbroadcastss	40(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vmulps			%ymm5, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm6, %ymm6
+	vbroadcastss	44(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+	vmulps			%ymm5, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	8(%r11), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vmulps			%ymm6, %ymm13, %ymm6
+	cmpl			$4, %r12d
+	jl				0f // ret
+	vbroadcastss	76(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+	vmulps			%ymm6, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	12(%r11), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+	vmulps			%ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsm_rlt_inv_16x4_vs_lib8, .-inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
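+//
+// rough scalar reference (pseudocode only): factorizes the 4x4 diagonal block
+// sitting in the top rows of the accumulator and propagates it down the
+// 16-row panel; inv_diag_E (r10) receives the reciprocal square root of each
+// pivot, or 0.0 when the pivot is not strictly positive, and kn (r11d) limits
+// how many columns are factorized:
+//
+//   for(jj=0; jj<4; jj++) {
+//     d = acc[jj][jj];
+//     inv = d>0.0f ? 1.0f/sqrtf(d) : 0.0f;
+//     inv_diag_E[jj] = inv;
+//     for(ii=0; ii<16; ii++) acc[ii][jj] *= inv;
+//     if(jj+1>=kn) break;
+//     for(ll=jj+1; ll<4; ll++)
+//       for(ii=0; ii<16; ii++) acc[ii][ll] -= acc[ll][jj] * acc[ii][jj];
+//   }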
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_16x4_vs_lib8, @function
+inner_edge_potrf_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_16x4_vs_lib8:
+#endif
+#endif
+
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vmovss		%xmm0, %xmm0, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+2:
+	vmovss		%xmm13, 0(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm0
+	vmulps		%ymm4, %ymm13, %ymm4
+	cmpl		$2, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm1, %ymm1
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm5, %ymm5
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vpermilps	$0x55, %xmm1, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+4:
+	vmovss		%xmm13, 4(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm1
+	vmulps		%ymm5, %ymm13, %ymm5
+	cmpl		$3, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm11
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vmulps		%ymm5, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vmulps		%ymm5, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vpermilps	$0xaa, %xmm2, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+6:
+	vmovss		%xmm13, 8(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm2
+	vmulps		%ymm6, %ymm13, %ymm6
+	cmpl		$4, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm11
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vmulps		%ymm6, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vpermilps	$0xff, %xmm3, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+8:
+	vmovss		%xmm13, 12(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm3
+	vmulps		%ymm7, %ymm13, %ymm7
+
+	jmp		0f
+
+
+1:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_16x4_vs_lib8, .-inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
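+//
+// same factorization as the 16x4 variant above, applied to a 12x4 tile: the
+// 4x4 diagonal block sits in rows 4-7 of the first 8-row panel, so each pivot
+// is taken from the upper 128-bit lane (vextractf128 $0x1 / vperm2f128 $0x11)
+// before the sqrt/reciprocal step; otherwise the scalar sketch above applies
+// with 12 active rows.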
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_12x4_vs_lib8, @function
+inner_edge_potrf_12x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_12x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_12x4_vs_lib8:
+#endif
+#endif
+
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vextractf128	$0x1, %ymm0, %xmm13
+//	vpermilps		$0x00, %xmm13, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+2:
+	vmovss			%xmm13, 0(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vmulps			%ymm4, %ymm13, %ymm4
+	cmpl		$2, %r11d
+	jl			0f // ret
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm11
+	vpermilps		$0x55, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm1, %ymm1
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm5, %ymm5
+	vpermilps		$0xaa, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps		$0xff, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vextractf128	$0x1, %ymm1, %xmm13
+	vpermilps		$0x55, %xmm13, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+4:
+	vmovss			%xmm13, 4(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vmulps			%ymm5, %ymm13, %ymm5
+	cmpl		$3, %r11d
+	jl			0f // ret
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm11
+	vpermilps		$0xaa, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vmulps		%ymm5, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps		$0xff, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vmulps		%ymm5, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vpermilps		$0xaa, %xmm13, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+6:
+	vmovss			%xmm13, 8(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vmulps			%ymm6, %ymm13, %ymm6
+	cmpl		$4, %r11d
+	jl			0f // ret
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm11
+	vpermilps		$0xff, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vmulps		%ymm6, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vextractf128	$0x1, %ymm3, %xmm13
+	vpermilps		$0xff, %xmm13, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+8:
+	vmovss			%xmm13, 12(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+	vmulps			%ymm7, %ymm13, %ymm7
+
+	jmp		0f
+
+
+1:
+	vxorps			%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_12x4_vs_lib8, .-inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
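+//
+// rough scalar reference (pseudocode only; C is read as two 8x4 panels, the
+// second one at C + 8*sdc*sizeof(float)):
+//
+//   for(jj=0; jj<4; jj++)
+//     for(ii=0; ii<16; ii++) {
+//       acc[ii][jj] *= alpha[0];
+//       if(beta[0]!=0.0f) acc[ii][jj] += beta[0]*C[ii][jj];
+//     }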
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_16x4_lib8, @function
+inner_scale_ab_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_16x4_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	movq	%r12, %r15 // C1 <- C0
+	addq	%r13, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	vmovaps		0(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+	vmovaps		0(%r15), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+	vmovaps		32(%r15), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+	vmovaps		64(%r15), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+	vmovaps		96(%r15), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_16x4_lib8, .-inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// r15  <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
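+//
+// generalized variant of the scale above: r12 carries the row offset of C
+// inside its panel; only the offset==0 path is implemented, the offset>0
+// branches below are placeholders (TODO) that fall through to the end without
+// reading C.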
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_16x4_gen_lib8, @function
+inner_scale_ab_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_16x4_gen_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+
+	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	movq	%r13, %rax // C1 <- C0
+	addq	%r14, %rax // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+
+	vmovaps		0(%rax), %ymm14
+	vmulps		%ymm14, %ymm15, %ymm14
+	vaddps		%ymm4, %ymm14, %ymm4
+	vmovaps		32(%rax), %ymm14
+	vmulps		%ymm14, %ymm15, %ymm14
+	vaddps		%ymm5, %ymm14, %ymm5
+	vmovaps		64(%rax), %ymm14
+	vmulps		%ymm14, %ymm15, %ymm14
+	vaddps		%ymm6, %ymm14, %ymm6
+	vmovaps		96(%rax), %ymm14
+	vmulps		%ymm14, %ymm15, %ymm14
+	vaddps		%ymm7, %ymm14, %ymm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%rax, %rbx // C1
+	addq	%r14, %rbx // C2 <- C1 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_16x4_gen_lib8, .-inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10   <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_A0_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_a0_16x4_lib8, @function
+inner_scale_a0_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_a0_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_16x4_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_a0_16x4_lib8, .-inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
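+//
+// fixed alpha=1.0, beta=1.0 case: simply acc[ii][jj] += C[ii][jj] over the
+// full 16x4 tile, with the second 8-row panel of C at C + 8*sdc*sizeof(float).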
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_16x4_lib8, @function
+inner_scale_11_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_11_16x4_lib8:
+#endif
+#endif
+	
+	movq	%r10, %r15 // C1 <- C0
+	addq	%r11, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	vmovaps		0(%r10), %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r10), %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r10), %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r10), %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+	vmovaps		0(%r15), %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+	vmovaps		32(%r15), %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+	vmovaps		64(%r15), %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+	vmovaps		96(%r15), %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_16x4_lib8, .-inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_16X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_16x4_gen_lib8, @function
+inner_scale_11_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_16x4_gen_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // C1 <- C0
+	addq	%r12, %rax // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r11), %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r11), %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r11), %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r11), %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+
+	vmovaps		0(%rax), %ymm14
+	vaddps		%ymm4, %ymm14, %ymm4
+	vmovaps		32(%rax), %ymm14
+	vaddps		%ymm5, %ymm14, %ymm5
+	vmovaps		64(%rax), %ymm14
+	vaddps		%ymm6, %ymm14, %ymm6
+	vmovaps		96(%rax), %ymm14
+	vaddps		%ymm7, %ymm14, %ymm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%rax, %rbx // C1
+	addq	%r12, %rbx // C2 <- C1 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_16x4_gen_lib8, .-inner_scale_11_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_16x4_lib8, @function
+inner_store_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_16x4_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_lib8:
+#endif
+#endif
+	
+	movq	%r10, %r15 // D1 <- D0
+	addq	%r11, %r15 // D1 <- D0 + 8*sdd*sizeof(float)
+
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps		%ymm3, 96(%r10)
+
+	vmovaps 	%ymm4,  0(%r15)
+	vmovaps 	%ymm5, 32(%r15)
+	vmovaps 	%ymm6, 64(%r15)
+	vmovaps 	%ymm7, 96(%r15)
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_16x4_lib8, .-inner_store_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
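+//
+// the row mask for the second 8-row panel is built in float arithmetic: km
+// (r12d) is broadcast and subtracted from the index table .LC01 (defined
+// elsewhere in this file, assumed to hold the lane indices of rows 8..15);
+// lanes whose index lies below km end up negative, i.e. with the sign bit
+// set, which is what vmaskmovps uses as store enable. the first panel is
+// always written in full and kn (r13d) limits how many of the 4 columns are
+// written.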
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_16X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_16x4_vs_lib8, @function
+inner_store_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	vmovaps		%ymm0, 0(%r10)
+	vmaskmovps	%ymm4, %ymm15, 0(%r10, %r11, 1)
+	cmpl		$2, %r13d
+	jl			7f // end
+	vmovaps		%ymm1, 32(%r10)
+	vmaskmovps	%ymm5, %ymm15, 32(%r10, %r11, 1)
+	cmpl		$3, %r13d
+	jl			7f // end
+	vmovaps		%ymm2, 64(%r10)
+	vmaskmovps	%ymm6, %ymm15, 64(%r10, %r11, 1)
+	je			7f // end
+	vmovaps		%ymm3, 96(%r10)
+	vmaskmovps	%ymm7, %ymm15, 96(%r10, %r11, 1)
+	//
+	jmp		0f
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_16x4_vs_lib8, .-inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
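+//
+// when the tile is stored with a nonzero column offset n0 (r15), the
+// accumulator columns are first shifted left by n0 positions and D is
+// advanced by n0*32 bytes, so that only columns n0 .. min(n1,4)-1 are
+// written; the row masks are derived from m0/m1 and the .LC00/.LC01 index
+// tables with the same sign-bit trick as in the vs store above, and the
+// offset>0 row cases are still placeholders (TODO).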
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_16X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_16x4_gen_lib8, @function
+inner_store_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute D1
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 4*sdd*sizeof(float)
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	cmpl		$2, %r15d
+	vmaskmovps	%ymm0, %ymm14,  0(%r11)
+	vmaskmovps	%ymm4, %ymm15,  0(%rbx)
+	jl			7f // end
+	cmpl		$3, %r15d
+	vmaskmovps	%ymm1, %ymm14, 32(%r11)
+	vmaskmovps	%ymm5, %ymm15, 32(%rbx)
+	jl			7f // end
+	vmaskmovps	%ymm2, %ymm14, 64(%r11)
+	vmaskmovps	%ymm6, %ymm15, 64(%rbx)
+	je			7f // end
+	vmaskmovps	%ymm3, %ymm14, 96(%r11)
+	vmaskmovps	%ymm7, %ymm15, 96(%rbx)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbp // D1
+	addq	%r12, %rbp // D2 <- D1 + 4*sdd*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_16x4_gen_lib8, .-inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
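+//
+// lower-triangular store: before writing the first 8x4 block back, columns 1,
+// 2 and 3 are blended with the data already in memory (blend masks 0x01,
+// 0x03, 0x07) so that their 1, 2 and 3 leading elements, i.e. the part above
+// the diagonal, are preserved; the second 8-row panel lies entirely below the
+// diagonal and is stored unmodified.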
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_16x4_lib8, @function
+inner_store_l_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_16x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_lib8:
+#endif
+#endif
+	
+	vmovaps		32(%r10), %ymm12
+	vmovaps		64(%r10), %ymm13
+	vmovaps		96(%r10), %ymm14
+
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vblendps	$0x03, %ymm13, %ymm2, %ymm2
+	vblendps	$0x07, %ymm14, %ymm3, %ymm3
+
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps		%ymm3, 96(%r10)
+
+	vmovaps 	%ymm4,  0(%r10, %r11, 1)
+	vmovaps 	%ymm5, 32(%r10, %r11, 1)
+	vmovaps 	%ymm6, 64(%r10, %r11, 1)
+	vmovaps 	%ymm7, 96(%r10, %r11, 1)
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_16x4_lib8, .-inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_16X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_16x4_vs_lib8, @function
+inner_store_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	vmovaps		%ymm0, 0(%r10)
+	vmaskmovps	%ymm4, %ymm15, 0(%r10, %r11, 1)
+	cmpl		$2, %r13d
+	jl			0f // end
+	vmovaps		32(%r10), %ymm12
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vmovaps		%ymm1, 32(%r10)
+	vmaskmovps	%ymm5, %ymm15, 32(%r10, %r11, 1)
+	cmpl		$3, %r13d
+	jl			0f // end
+	vmovaps		64(%r10), %ymm12
+	vblendps	$0x03, %ymm12, %ymm2, %ymm2
+	vmovaps		%ymm2, 64(%r10)
+	vmaskmovps	%ymm6, %ymm15, 64(%r10, %r11, 1)
+	je			0f // end
+	vmovaps		96(%r10), %ymm12
+	vblendps	$0x07, %ymm12, %ymm3, %ymm3
+	vmovaps		%ymm3, 96(%r10)
+	vmaskmovps	%ymm7, %ymm15, 96(%r10, %r11, 1)
+	//
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_16x4_vs_lib8, .-inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_16X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_16x4_gen_lib8, @function
+inner_store_l_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm14,  0(%r11)
+	vmaskmovps	%ymm4, %ymm15,  0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmovaps		32(%r11), %ymm12
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm14, 32(%r11)
+	vmaskmovps	%ymm5, %ymm15, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmovaps		64(%r11), %ymm12
+	vblendps	$0x03, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm14, 64(%r11)
+	vmaskmovps	%ymm6, %ymm15, 64(%r11, %r12, 1)
+	je			7f // end
+	vmovaps		96(%r11), %ymm12
+	vblendps	$0x07, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm14, 96(%r11)
+	vmaskmovps	%ymm7, %ymm15, 96(%r11, %r12, 1)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_16x4_gen_lib8, .-inner_store_l_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
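+//
+// 12x4 variant of the lower-triangular store: the 4x4 diagonal block sits in
+// rows 4-7 of the first panel, so the blend masks 0x0f, 0x1f, 0x3f and 0x7f
+// keep the 4, 5, 6 and 7 leading elements of each column that are already in
+// memory; the second 8-row panel is stored unmodified.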
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_12X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_12x4_lib8, @function
+inner_store_l_12x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_12x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_lib8:
+#endif
+#endif
+	
+	vmovaps		0(%r10), %ymm12
+	vmovaps		32(%r10), %ymm13
+	vmovaps		64(%r10), %ymm14
+	vmovaps		96(%r10), %ymm15
+
+	vblendps	$0x0f, %ymm12, %ymm0, %ymm0
+	vblendps	$0x1f, %ymm13, %ymm1, %ymm1
+	vblendps	$0x3f, %ymm14, %ymm2, %ymm2
+	vblendps	$0x7f, %ymm15, %ymm3, %ymm3
+
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps		%ymm3, 96(%r10)
+
+	vmovaps 	%ymm4,  0(%r10, %r11, 1)
+	vmovaps 	%ymm5, 32(%r10, %r11, 1)
+	vmovaps 	%ymm6, 64(%r10, %r11, 1)
+	vmovaps 	%ymm7, 96(%r10, %r11, 1)
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_12x4_lib8, .-inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_12X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_12x4_vs_lib8, @function
+inner_store_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	vmovaps		0(%r10), %ymm12
+	vblendps	$0x0f, %ymm12, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r10)
+	vmaskmovps	%ymm4, %ymm15, 0(%r10, %r11, 1)
+	cmpl		$2, %r13d
+	jl			0f // end
+	vmovaps		32(%r10), %ymm12
+	vblendps	$0x1f, %ymm12, %ymm1, %ymm1
+	vmovaps		%ymm1, 32(%r10)
+	vmaskmovps	%ymm5, %ymm15, 32(%r10, %r11, 1)
+	cmpl		$3, %r13d
+	jl			0f // end
+	vmovaps		64(%r10), %ymm12
+	vblendps	$0x3f, %ymm12, %ymm2, %ymm2
+	vmovaps		%ymm2, 64(%r10)
+	vmaskmovps	%ymm6, %ymm15, 64(%r10, %r11, 1)
+	je			0f // end
+	vmovaps		96(%r10), %ymm12
+	vblendps	$0x7f, %ymm12, %ymm3, %ymm3
+	vmovaps		%ymm3, 96(%r10)
+	vmaskmovps	%ymm7, %ymm15, 96(%r10, %r11, 1)
+	//
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_12x4_vs_lib8, .-inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_12X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_12x4_gen_lib8, @function
+inner_store_l_12x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_12x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmovaps		0(%r11), %ymm12
+	vblendps	$0x0f, %ymm12, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm14,  0(%r11)
+	vmaskmovps	%ymm4, %ymm15,  0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmovaps		32(%r11), %ymm12
+	vblendps	$0x1f, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm14, 32(%r11)
+	vmaskmovps	%ymm5, %ymm15, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmovaps		64(%r11), %ymm12
+	vblendps	$0x3f, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm14, 64(%r11)
+	vmaskmovps	%ymm6, %ymm15, 64(%r11, %r12, 1)
+	je			7f // end
+	vmovaps		96(%r11), %ymm12
+	vblendps	$0x7f, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm14, 96(%r11)
+	vmaskmovps	%ymm7, %ymm15, 96(%r11, %r12, 1)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_12x4_gen_lib8, .-inner_store_l_12x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                                rdi    rsi           rdx       rcx      r8        r9           rsp+8     rsp+16   rsp+24    rsp+32
+// void kernel_sgemm_nt_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
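+//
+// rough scalar reference for the whole kernel (pseudocode only; A, B, C and D
+// are packed lib8 panel-major, which is abstracted away here):
+//
+//   for(jj=0; jj<4; jj++)
+//     for(ii=0; ii<16; ii++) {
+//       tmp = 0.0f;
+//       for(kk=0; kk<k; kk++) tmp += A[ii][kk] * B[jj][kk];
+//       D[ii][jj] = alpha[0]*tmp + beta[0]*C[ii][jj];
+//     }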
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_16x4_lib8
+	.type kernel_sgemm_nt_16x4_lib8, @function
+kernel_sgemm_nt_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_16x4_lib8
+_kernel_sgemm_nt_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_16x4_lib8, .-kernel_sgemm_nt_16x4_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5         6            7         8        9         10       11      12
+// void kernel_sgemm_nt_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_16x4_vs_lib8
+	.type kernel_sgemm_nt_16x4_vs_lib8, @function
+kernel_sgemm_nt_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_16x4_vs_lib8
+_kernel_sgemm_nt_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_16x4_vs_lib8
+	.def kernel_sgemm_nt_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_16x4_vs_lib8, .-kernel_sgemm_nt_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                    rdi    rsi           rdx       rcx      r8        r9           rsp+8        rsp+16    rsp+24   rsp+32       rsp+40    rsp+48   rsp+56  rsp+64  rsp+72  rsp+80
+// void kernel_sgemm_nt_16x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_16x4_gen_lib8
+	.type kernel_sgemm_nt_16x4_gen_lib8, @function
+kernel_sgemm_nt_16x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_16x4_gen_lib8
+_kernel_sgemm_nt_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_16x4_gen_lib8
+	.def kernel_sgemm_nt_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // offsetC
+	movq	ARG8, %r13 // C
+	movq	ARG9, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_16x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG10, %r10 // offsetD
+	movq	ARG11, %r11 // D
+	movq	ARG12, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG13, %r13 // m0
+	movq	ARG14, %r14 // m1
+	movq	ARG15, %r15 // n0
+	movq	ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_16x4_gen_lib8, .-kernel_sgemm_nt_16x4_gen_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi           rdx       rcx      r8           r9        rsp+8    rsp+16       rsp+24    rsp+32   rsp+40    rsp+48
+// void kernel_sgemm_nn_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd);
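+// Illustrative caller (not part of this file): a minimal sketch of how a blocked sgemm_nn driver
+// could invoke this kernel, assuming A, B, C and D are already packed in the lib8 panel-major
+// layout (8-row panels, 8 floats per column within a panel, panel strides sd* as used by the
+// 8*sd*sizeof(float) address computations below) and that m and n are multiples of 16 and 4.
+// The tile pointer arithmetic is an assumption derived from that layout, not taken from this file.
+//
+//   void sgemm_nn_blocked(int m, int n, int k, float alpha,
+//                         float *A, int sda, float *B, int sdb,
+//                         float beta, float *C, int sdc, float *D, int sdd)
+//       {
+//       int ii, jj;
+//       for(ii=0; ii<m; ii+=16) // two 8-row panels of A, C and D per call
+//           {
+//           for(jj=0; jj<n; jj+=4) // four columns of B, C and D per call
+//               {
+//               kernel_sgemm_nn_16x4_lib8(k, &alpha,
+//                   A+ii*sda, sda,       // rows ii..ii+15 of A
+//                   0, B+jj*8, sdb,      // offsetB=0, columns jj..jj+3 of B
+//                   &beta,
+//                   C+ii*sdc+jj*8, sdc,  // 16x4 tile of C
+//                   D+ii*sdd+jj*8, sdd); // 16x4 tile of D
+//               }
+//           }
+//       }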
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_16x4_lib8
+	.type kernel_sgemm_nn_16x4_lib8, @function
+kernel_sgemm_nn_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_16x4_lib8
+_kernel_sgemm_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_16x4_lib8
+	.def kernel_sgemm_nn_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12   // C
+	movq	ARG10, %r13   // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_16x4_lib8, .-kernel_sgemm_nn_16x4_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5            6         7        8            9         10       11        12       13      14
+// void kernel_sgemm_nn_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
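+// note: the _vs ("variable size") variant is the same computation as kernel_sgemm_nn_16x4_lib8,
+// but inner_store_16x4_vs_lib8 writes only the top-left km x kn part of the 16x4 tile
+// (km<=16, kn<=4), typically used for the leftover edges of the matrix.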
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_16x4_vs_lib8
+	.type kernel_sgemm_nn_16x4_vs_lib8, @function
+kernel_sgemm_nn_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_16x4_vs_lib8
+_kernel_sgemm_nn_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_16x4_vs_lib8
+	.def kernel_sgemm_nn_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12   // C
+	movq	ARG10, %r13   // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG13, %r12 // km
+	movq	ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_16x4_vs_lib8, .-kernel_sgemm_nn_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                    rdi    rsi           rdx       rcx      r8        r9        rsp+8    rsp+16       rsp+24    rsp+32    rsp+40   rsp+48    rsp+56    rsp+64   rsp+72  rsp+80  rsp+88  rsp+96
+// void kernel_sgemm_nn_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_16x4_gen_lib8
+	.type kernel_sgemm_nn_16x4_gen_lib8, @function
+kernel_sgemm_nn_16x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_16x4_gen_lib8
+_kernel_sgemm_nn_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_16x4_gen_lib8
+	.def kernel_sgemm_nn_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12 // offsetC
+	movq	ARG10, %r13 // C
+	movq	ARG11, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_16x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG12, %r10 // offsetD
+	movq	ARG13, %r11 // D
+	movq	ARG14, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG15, %r13 // m0
+	movq	ARG16, %r14 // m1
+	movq	ARG17, %r15 // n0
+	movq	ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_16x4_gen_lib8, .-kernel_sgemm_nn_16x4_gen_lib8
+#endif
+
+
+
+
+
+//                                  1      2             3         4        5         6            7         8        9         10
+// void kernel_ssyrk_nt_l_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
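+// note: reuses the generic 16x4 nt accumulation and alpha/beta scaling, but the final
+// inner_store_l_16x4_lib8 writes only the lower-triangular part of the block, as expected for
+// the diagonal blocks of a syrk update (D = alpha*A*B' + beta*C, lower part stored).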
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_16x4_lib8
+	.type kernel_ssyrk_nt_l_16x4_lib8, @function
+kernel_ssyrk_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_16x4_lib8
+_kernel_ssyrk_nt_l_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_16x4_lib8, .-kernel_ssyrk_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5         6            7         8        9         10       11      12
+// void kernel_ssyrk_nt_l_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_16x4_vs_lib8
+	.type kernel_ssyrk_nt_l_16x4_vs_lib8, @function
+kernel_ssyrk_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_16x4_vs_lib8
+_kernel_ssyrk_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_16x4_vs_lib8
+	.def kernel_ssyrk_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_16x4_vs_lib8, .-kernel_ssyrk_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                  1      2             3         4        5         6            7         8        9         10
+// void kernel_ssyrk_nt_l_12x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_12x4_lib8
+	.type kernel_ssyrk_nt_l_12x4_lib8, @function
+kernel_ssyrk_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_12x4_lib8
+_kernel_ssyrk_nt_l_12x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_12x4_lib8, .-kernel_ssyrk_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5         6            7         8        9         10       11      12
+// void kernel_ssyrk_nt_l_12x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_12x4_vs_lib8
+	.type kernel_ssyrk_nt_l_12x4_vs_lib8, @function
+kernel_ssyrk_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_12x4_vs_lib8
+_kernel_ssyrk_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_12x4_vs_lib8
+	.def kernel_ssyrk_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_12x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_12x4_vs_lib8, .-kernel_ssyrk_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+//                                       rdi    rsi       rdx      rcx       r8        r9       rsp+8     rsp+16   rsp+24    rsp+32 
+// void kernel_strsm_nt_rl_inv_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
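+// note: as the name and the inner calls suggest, this computes C - A*B' (gemm_sub_nt followed by
+// scale_11, i.e. no alpha/beta scaling) and then applies the right-lower-triangular solve with E,
+// using the pre-inverted diagonal entries in inv_diag_E, before storing the 16x4 result in D.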
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_16x4_lib8
+	.type kernel_strsm_nt_rl_inv_16x4_lib8, @function
+kernel_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_16x4_lib8
+_kernel_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_16x4_lib8
+	.def kernel_strsm_nt_rl_inv_16x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movl	$4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_16x4_lib8, .-kernel_strsm_nt_rl_inv_16x4_lib8
+#endif
+
+
+
+
+
+//                                          rdi    rsi       rdx      rcx       r8        r9       rsp+8     rsp+16   rsp+24    rsp+32             rsp+40  rsp+48
+// void kernel_strsm_nt_rl_inv_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_16x4_vs_lib8
+	.type kernel_strsm_nt_rl_inv_16x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_16x4_vs_lib8
+_kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_16x4_vs_lib8
+	.def kernel_strsm_nt_rl_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG11, %r12 // m1 
+	movq	ARG12, %r13 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_16x4_vs_lib8, .-kernel_strsm_nt_rl_inv_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                             1       2          3         4          5       6          7         8          9         10       11        12       13        14
+// void kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
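+// note: fused variant of the kernel above: it first accumulates + Ap*Bp' over kp columns and
+// - Am*Bm' over km columns, adds C (scale_11), and then runs the same trsm edge routine with
+// E/inv_diag_E before storing D, avoiding an extra pass over the 16x4 accumulator.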
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_16x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10  // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG13, %r10  // E 
+	movq	ARG14, %r11  // inv_diag_E 
+	movl	$4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_16x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+#endif
+
+
+
+
+
+//                                                1       2          3         4          5       6          7         8          9         10       11        12       13        14                 15      16
+// void kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10  // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG13, %r10  // E 
+	movq	ARG14, %r11  // inv_diag_E 
+	movq	ARG16, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG15, %r12 // km 
+	movq	ARG16, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                   1      2         3        4         5         6        7         8        9
+// void kernel_spotrf_nt_l_12x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
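+// note: subtracts A*B' from C (gemm_sub_nt + scale_11) and then runs the Cholesky edge routine
+// inner_edge_potrf_12x4_vs_lib8 with n1=4, storing the lower-triangular 12x4 factor block in D;
+// inv_diag_D, as its name indicates, receives the inverted diagonal entries, presumably for
+// reuse by the trsm kernels.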
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_12x4_lib8
+	.type kernel_spotrf_nt_l_12x4_lib8, @function
+kernel_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_12x4_lib8
+_kernel_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_12x4_lib8
+	.def kernel_spotrf_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_12x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_12x4_lib8, .-kernel_spotrf_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+//                                      1      2         3        4         5         6        7         8        9                  10      11
+// void kernel_spotrf_nt_l_12x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_12x4_vs_lib8
+	.type kernel_spotrf_nt_l_12x4_vs_lib8, @function
+kernel_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_12x4_vs_lib8
+_kernel_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_12x4_vs_lib8
+	.def kernel_spotrf_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_12x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG10, %r12 // m1 
+	movq	ARG11, %r13 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_12x4_vs_lib8, .-kernel_spotrf_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+//                                   1      2         3        4         5         6        7         8        9
+// void kernel_spotrf_nt_l_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_16x4_lib8
+	.type kernel_spotrf_nt_l_16x4_lib8, @function
+kernel_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_16x4_lib8
+_kernel_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_16x4_lib8
+	.def kernel_spotrf_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_16x4_lib8, .-kernel_spotrf_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+//                                      1      2         3        4         5         6        7         8        9                  10      11
+// void kernel_spotrf_nt_l_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_16x4_vs_lib8
+	.type kernel_spotrf_nt_l_16x4_vs_lib8, @function
+kernel_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_16x4_vs_lib8
+_kernel_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_16x4_vs_lib8
+	.def kernel_spotrf_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG10, %r12 // m1 
+	movq	ARG11, %r13 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_16x4_vs_lib8, .-kernel_spotrf_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                        1        2          3         4          5       6          7         8          9         10       11        12       13
+// void kernel_ssyrk_spotrf_nt_l_12x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
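+// note: fuses the syrk-style update and the factorization of one 12x4 block: + Ap*Bp' over kp
+// columns, - Am*Bm' over km columns, addition of C, then the same potrf edge routine and
+// lower-triangular store as kernel_spotrf_nt_l_12x4_lib8.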
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_12x4_lib8
+	.type kernel_ssyrk_spotrf_nt_l_12x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_12x4_lib8
+_kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_12x4_lib8
+	.def kernel_ssyrk_spotrf_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_12x4_lib8, .-kernel_ssyrk_spotrf_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+//                                            1        2          3         4          5       6          7         8          9         10       11        12       13                14      15
+// void kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+	.type kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+	.def kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movq	ARG15, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+	movq	ARG14, %r12 // km 
+	movq	ARG15, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+//                                        1        2          3         4          5       6          7         8          9         10       11        12       13
+// void kernel_ssyrk_spotrf_nt_l_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_16x4_lib8
+	.type kernel_ssyrk_spotrf_nt_l_16x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_16x4_lib8
+_kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_16x4_lib8
+	.def kernel_ssyrk_spotrf_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_16x4_lib8, .-kernel_ssyrk_spotrf_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+//                                            1        2          3         4          5       6          7         8          9         10       11        12       13                14      15
+// void kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+	.type kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+	.def kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movq	ARG15, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+	movq	ARG14, %r12 // km 
+	movq	ARG15, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5            6         7        8         9
+// void kernel_strmm_nn_rl_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd);
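+// note: triangular matrix-matrix multiply with the triangle on the right (nn_rl): a trmm edge
+// pass over the initial triangle of B, the regular nn inner kernel for the remaining columns,
+// and then scaling by alpha only (inner_scale_a0, no beta/C term) before the 16x4 store.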
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_16x4_lib8
+	.type kernel_strmm_nn_rl_16x4_lib8, @function
+kernel_strmm_nn_rl_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_16x4_lib8
+_kernel_strmm_nn_rl_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_16x4_lib8
+	.def kernel_strmm_nn_rl_16x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_16x4_lib8, .-kernel_strmm_nn_rl_16x4_lib8
+#endif
+
+
+
+
+
+//                                      1      2             3         4        5            6         7        8         9        10      11
+// void kernel_strmm_nn_rl_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_16x4_vs_lib8
+	.type kernel_strmm_nn_rl_16x4_vs_lib8, @function
+kernel_strmm_nn_rl_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_16x4_vs_lib8
+_kernel_strmm_nn_rl_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_16x4_vs_lib8
+	.def kernel_strmm_nn_rl_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG10, %r12 // km
+	movq	ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_16x4_vs_lib8, .-kernel_strmm_nn_rl_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                       1      2             3         4        5            6         7        8            9         10       11      12      13      14
+// void kernel_strmm_nn_rl_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_16x4_gen_lib8
+	.type kernel_strmm_nn_rl_16x4_gen_lib8, @function
+kernel_strmm_nn_rl_16x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_16x4_gen_lib8
+_kernel_strmm_nn_rl_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_16x4_gen_lib8
+	.def kernel_strmm_nn_rl_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // offsetD
+	movq	ARG9, %r11 // D
+	movq	ARG10, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG11, %r13 // m0
+	movq	ARG12, %r14 // m1
+	movq	ARG13, %r15 // n0
+	movq	ARG14, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_16x4_gen_lib8, .-kernel_strmm_nn_rl_16x4_gen_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_8x4_lib8.S b/kernel/avx/kernel_sgemm_8x4_lib8.S
new file mode 100644
index 0000000..d319a83
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_8x4_lib8.S
@@ -0,0 +1,6673 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
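+
+// note on the two ABIs above: on Linux/Mac (System V) the first six integer
+// arguments arrive in rdi, rsi, rdx, rcx, r8, r9 and the remaining ones are read
+// from the caller's stack past the local STACKSIZE save area; on Windows only
+// rcx, rdx, r8, r9 are register arguments, while rdi, rsi and xmm6-xmm15 are
+// callee-saved, hence the extra spills in PROLOGUE/EPILOGUE and the different
+// stack offsets (return address plus 32-byte shadow space).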
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
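+
+// note: the product is built without FMA: vbroadcastf128 loads four consecutive
+// entries of a B column into both 128-bit lanes, and the vshufps rotations
+// (0xb1, 0x4e, 0xb1) cycle that quadruplet so that ymm0-ymm3 each accumulate a
+// different pairing of A rows with B entries; the pairing is undone downstream,
+// before the result is scaled and stored. The _sub_ variant further below is
+// identical except that vsubps replaces vaddps.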
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nt_8x4_lib8, @function
+inner_kernel_gemm_add_nt_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nt_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_8x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vbroadcastf128	32(%r12), %ymm15 // B
+	vmovaps			32(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	96(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	128(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	32(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+	vmovaps			32(%r11), %ymm13 // A
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	96(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+//	vbroadcastf128	128(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+//	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm15, %ymm11
+//	vbroadcastf128	32(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+//	vmovaps			32(%r11), %ymm13 // A
+
+
+//	cmpl	$4, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$32, %r12
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nt_8x4_lib8, .-inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_sub_nt_8x4_lib8, @function
+inner_kernel_gemm_sub_nt_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_sub_nt_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_8x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vbroadcastf128	32(%r12), %ymm15 // B
+	vmovaps			32(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	96(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	128(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	32(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+	vmovaps			32(%r11), %ymm13 // A
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	96(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+//	vbroadcastf128	128(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+//	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm15, %ymm11
+//	vbroadcastf128	32(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+//	vmovaps			32(%r11), %ymm13 // A
+
+
+//	cmpl	$4, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$32, %r12
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_sub_nt_8x4_lib8, .-inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
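+
+// note: in the NN case single B entries are splat with vbroadcastss, so ymm0-ymm3
+// hold plain columns of A*B; r13 carries the byte stride of one 8-row panel of B,
+// and r14 always points at the next panel, which the 8-way unrolled main loop
+// prefetches one panel ahead with prefetcht0.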
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nn_8x4_lib8, @function
+inner_kernel_gemm_add_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_8x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r12, %r14 // B_next <- B
+	addq	%r13, %r14 // B_next <- B + 8*sdb*sizeof(float)
+
+	cmpl	$8, %r10d
+	jl		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	0(%r14) // software prefetch
+	prefetcht0	64(%r14) // software prefetch
+
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 1
+	vmovaps			32(%r11), %ymm12 // A[0]
+	vbroadcastss	4(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	68(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	100(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 2
+	vmovaps			64(%r11), %ymm12 // A[0]
+	vbroadcastss	8(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	40(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	72(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	104(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 3
+	vmovaps			96(%r11), %ymm12 // A[0]
+	vbroadcastss	12(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	44(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	76(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	108(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 4
+	vmovaps			128(%r11), %ymm12 // A[0]
+	vbroadcastss	16(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	48(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	80(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	112(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 5
+	vmovaps			160(%r11), %ymm12 // A[0]
+	vbroadcastss	20(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	52(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	84(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	116(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 6
+	vmovaps			192(%r11), %ymm12 // A[0]
+	vbroadcastss	24(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	56(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	88(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	120(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 7
+	vmovaps			224(%r11), %ymm12 // A[0]
+	vbroadcastss	28(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	60(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	92(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	124(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+	subl	$8, %r10d
+	addq	$256, %r11
+
+	mov		%r14, %r12
+	addq	%r13, %r14
+
+	cmpl	$7, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean1-up loop
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean1-up loop
+	
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$4, %r12
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nn_8x4_lib8, .-inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_SUB_NN_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_sub_nn_8x4_lib8, @function
+inner_kernel_gemm_sub_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_sub_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nn_8x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r12, %r14 // B_next <- B
+	addq	%r13, %r14 // B_next <- B + 8*sdb*sizeof(float)
+
+	cmpl	$8, %r10d
+	jl		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	0(%r14) // software prefetch
+	prefetcht0	64(%r14) // software prefetch
+
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 1
+	vmovaps			32(%r11), %ymm12 // A[0]
+	vbroadcastss	4(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	68(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	100(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 2
+	vmovaps			64(%r11), %ymm12 // A[0]
+	vbroadcastss	8(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	40(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	72(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	104(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 3
+	vmovaps			96(%r11), %ymm12 // A[0]
+	vbroadcastss	12(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	44(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	76(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	108(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 4
+	vmovaps			128(%r11), %ymm12 // A[0]
+	vbroadcastss	16(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	48(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	80(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	112(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 5
+	vmovaps			160(%r11), %ymm12 // A[0]
+	vbroadcastss	20(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	52(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	84(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	116(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 6
+	vmovaps			192(%r11), %ymm12 // A[0]
+	vbroadcastss	24(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	56(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	88(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	120(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 7
+	vmovaps			224(%r11), %ymm12 // A[0]
+	vbroadcastss	28(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	60(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	92(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	124(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+	subl	$8, %r10d
+	addq	$256, %r11
+
+	mov		%r14, %r12
+	addq	%r13, %r14
+
+	cmpl	$7, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean1-up loop
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean1-up loop
+	
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$4, %r12
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_sub_nn_8x4_lib8, .-inner_kernel_gemm_sub_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(float)
+// r14   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(8-offB)
+// r11   <- A+(8-offB)*bs*sizeof(float)
+// r12   <- B-offB+bs*sdb*sizeof(float)
+// r13   <- bs*sdb*sizeof(float)
+// r14   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
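+
+// note: this edge performs one rank-1 update per column for kend = min(k, 8-offsetB)
+// columns, i.e. just enough to reach the end of the current 8-wide panel of B, so
+// that the main NN kernel can then proceed panel-aligned.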
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_gemm_add_nn_8x4_lib8, @function
+inner_edge_gemm_add_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_gemm_add_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_8x4_lib8:
+#endif
+#endif
+	
+	cmpl			$0, %r14d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$8, %r15d
+	subl			%r14d, %r15d // 8-offsetB
+	cmpl			%r10d, %r15d
+//	jle				0f
+//	movl			%r10d, %r15d // kend=min(k,8-offsetB)
+//0:
+	cmovgl			%r10d, %r15d // kend=min(k,8-offsetB)
+
+	movl			%r14d, %eax
+	sall			$2, %eax // offsetB*sizeof(float)
+	addq			%rax, %r12 // B+offsetB*sizeof(float)
+
+1:
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	subl			$1, %r15d // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(float)
+	addq			$4, %r12 // B+1*sizeof(float)
+
+	cmpl			$0, %r15d
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r13, %r12
+	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_gemm_add_nn_8x4_lib8, .-inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
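+
+// note: B is lower triangular, so in the first columns only the entries on or
+// below the diagonal contribute; the branches on offB below select how many of
+// the accumulators ymm0-ymm2 already receive updates and where B wraps to the
+// next 8-wide panel, before the general NN edge/kernel takes over.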
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trmm_nn_rl_8x4_lib8, @function
+inner_edge_trmm_nn_rl_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trmm_nn_rl_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_8x4_lib8:
+#endif
+#endif
+	
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	movl		%r14d, %eax
+	sall		$2, %eax // offsetB*sizeof(float)
+	movq		%r12, %rbx // B
+	addq		%rax, %rbx // B+offsetB*sizeof(float)
+
+
+	cmpl	$4, %r14d
+	jg		1f
+
+	// offB==0, 1, 2, 3, 4
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	8(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	40(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	72(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	jmp			0f // end
+
+
+1:
+	cmpl	$5, %r14d
+	jg		1f
+
+	// offB==5
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	8(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	40(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	72(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r13, %r12 // B+8*sdb*sizeof(float)
+	movl		$0, %r14d // offsetB=0
+
+	jmp			0f // end
+
+
+1:
+	cmpl	$6, %r14d
+	jg		1f
+
+	// offB==6
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r13, %r12 // B+8*sdb*sizeof(float)
+	movq		%r12, %rbx // B
+	movl		$0, %r14d // offsetB=0
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	jmp			0f // end
+
+
+1:
+//	cmpl	$7, %r14d
+//	jg		0f
+
+	// offB==7
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r13, %r12 // B+8*sdb*sizeof(float)
+	movq		%r12, %rbx // B
+	movl		$0, %r14d // offsetB=0
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	68(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+//	jmp			0f // end
+
+
+	// end
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trmm_nn_rl_8x4_lib8, .-inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
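+
+// note: this solves D * E^T = C for D, with E lower triangular, one column at a
+// time: d_j = (c_j - sum_{i<j} d_i * e_ji) * inv_diag_E[j]; the broadcast /
+// multiply / subtract sequence below implements exactly that, reading the
+// off-diagonal entries e_ji from E (r10) and the precomputed reciprocals of the
+// diagonal from inv_diag_E (r11).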
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsm_rlt_inv_8x4_lib8, @function
+inner_edge_trsm_rlt_inv_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsm_rlt_inv_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x4_lib8:
+#endif
+#endif
+
+	vbroadcastss	0(%r11), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vbroadcastss	4(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm1, %ymm1
+	vbroadcastss	8(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vbroadcastss	12(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	4(%r11), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vbroadcastss	40(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vbroadcastss	44(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	8(%r11), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vbroadcastss	76(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	12(%r11), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsm_rlt_inv_8x4_lib8, .-inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsm_rlt_inv_8x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsm_rlt_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#endif
+#endif
+
+	vbroadcastss	0(%r11), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	cmpl			$2, %r12d
+	jl				0f // ret
+	vbroadcastss	4(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm1, %ymm1
+	vbroadcastss	8(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vbroadcastss	12(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	4(%r11), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	cmpl			$3, %r12d
+	jl				0f // ret
+	vbroadcastss	40(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vbroadcastss	44(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	8(%r11), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	cmpl			$4, %r12d
+	jl				0f // ret
+	vbroadcastss	76(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	12(%r11), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsm_rlt_inv_8x4_vs_lib8, .-inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization 
+//
+// input arguments:
+// r10  <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
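+
+// note: factorizes the leading 4x4 block of the 8x4 panel in place (C = L*L^T):
+// each pivot is replaced by 1.0/sqrt(d_jj), stored into inv_diag_E, and used to
+// scale its column before the trailing columns are down-dated; a non-positive
+// pivot branches to 1f/3f/5f/7f, which substitute 0.0 instead of dividing.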
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_8x4_lib8, @function
+inner_edge_potrf_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x4_lib8:
+#endif
+#endif
+
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vmovss		%xmm0, %xmm0, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+2:
+	vmovss		%xmm13, 0(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm0
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm1, %ymm1
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0x55, %xmm1, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+4:
+	vmovss		%xmm13, 4(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm1
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm11
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0xaa, %xmm2, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+6:
+	vmovss		%xmm13, 8(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm2
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm11
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0xff, %xmm3, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+8:
+	vmovss		%xmm13, 12(%r10) // inv_diag_E[3]
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm3
+
+	jmp		0f
+
+
+1:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_8x4_lib8, .-inner_edge_potrf_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_8x4_vs_lib8, @function
+inner_edge_potrf_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x4_vs_lib8:
+#endif
+#endif
+
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vmovss		%xmm0, %xmm0, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+2:
+	vmovss		%xmm13, 0(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm0
+	cmpl		$2, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm1, %ymm1
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0x55, %xmm1, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+4:
+	vmovss		%xmm13, 4(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm1
+	cmpl		$3, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm11
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0xaa, %xmm2, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+6:
+	vmovss		%xmm13, 8(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm2
+	cmpl		$4, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm11
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0xff, %xmm3, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+8:
+	vmovss		%xmm13, 12(%r10) // inv_diag_E[3]
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm3
+
+	jmp		0f
+
+
+1:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_8x4_vs_lib8, .-inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
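+
+// note: computes D = alpha*acc + beta*C for the 8x4 block held in ymm0-ymm3;
+// when beta compares equal to 0.0 the loads from C are skipped, so C is never
+// dereferenced in that case.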
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x4_lib8, @function
+inner_scale_ab_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovaps		0(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x4_lib8, .-inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
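+// illustrative C-style sketch (not part of the original source): the 8x4
+// accumulator is scaled by alpha, transposed into a 4x8 block, and then
+// combined with beta*C, where C uses the same 8-float column stride but
+// only its first 4 rows are read; conceptually
+//
+//   for(jj=0; jj<8; jj++)
+//       for(ii=0; ii<4; ii++)
+//           D[ii+8*jj] = alpha[0]*accT[ii][jj] + beta[0]*C[ii+8*jj];
+//
+// where accT denotes the transposed accumulator
+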
+#if MACRO_LEVEL>=1
+	.macro INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_tran_scale_ab_4x8_lib8, @function
+inner_tran_scale_ab_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_tran_scale_ab_4x8_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm4
+	vmulps		%ymm1, %ymm15, %ymm5
+	vmulps		%ymm2, %ymm15, %ymm6
+	vmulps		%ymm3, %ymm15, %ymm7
+
+	// transpose
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm0
+	vblendps	$0xaa, %ymm4, %ymm5, %ymm1
+	vblendps	$0xaa, %ymm6, %ymm7, %ymm2
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm3
+
+	vunpcklps	%ymm1, %ymm0, %ymm4
+	vunpckhps	%ymm1, %ymm0, %ymm5
+	vunpcklps	%ymm3, %ymm2, %ymm6
+	vunpckhps	%ymm3, %ymm2, %ymm7
+
+	vunpcklpd	%ymm5, %ymm7, %ymm2
+	vunpckhpd	%ymm5, %ymm7, %ymm3
+	vunpcklpd	%ymm6, %ymm4, %ymm0
+	vunpckhpd	%ymm6, %ymm4, %ymm1
+
+	vextractf128 $0x1, %ymm0, %xmm4
+	vextractf128 $0x1, %ymm1, %xmm5
+	vextractf128 $0x1, %ymm2, %xmm6
+	vextractf128 $0x1, %ymm3, %xmm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovaps		0(%r12), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm0, %xmm0
+	vmovaps		32(%r12), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm1, %xmm1
+	vmovaps		64(%r12), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm2, %xmm2
+	vmovaps		96(%r12), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm3, %xmm3
+	vmovaps		128(%r12), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm4, %xmm4
+	vmovaps		160(%r12), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm5, %xmm5
+	vmovaps		192(%r12), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm6, %xmm6
+	vmovaps		224(%r12), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm7, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_tran_scale_ab_4x8_lib8, .-inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 8*sdc*sizeof(float)
+// r15  <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
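+// illustrative sketch (not part of the original source): in the _gen
+// variant the 8 rows of C may start at a non-zero row offset inside a
+// panel, so they can span two consecutive panels,
+//
+//   float *C0 = C;            // panel containing the first rows (hypothetical names)
+//   float *C1 = C + 8*sdc;    // next panel
+//
+// only the offset==0 case is implemented below; the other offsets are
+// still marked TODO
+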
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x4_gen_lib8, @function
+inner_scale_ab_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_gen_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+
+	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x4_gen_lib8, .-inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 8*sdc*sizeof(float)
+// r15  <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_TRAN_SCALE_AB_4X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_tran_scale_ab_4x8_gen_lib8, @function
+inner_tran_scale_ab_4x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_tran_scale_ab_4x8_gen_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_gen_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm4
+	vmulps		%ymm1, %ymm15, %ymm5
+	vmulps		%ymm2, %ymm15, %ymm6
+	vmulps		%ymm3, %ymm15, %ymm7
+
+	// transpose
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm0
+	vblendps	$0xaa, %ymm4, %ymm5, %ymm1
+	vblendps	$0xaa, %ymm6, %ymm7, %ymm2
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm3
+
+	vunpcklps	%ymm1, %ymm0, %ymm4
+	vunpckhps	%ymm1, %ymm0, %ymm5
+	vunpcklps	%ymm3, %ymm2, %ymm6
+	vunpckhps	%ymm3, %ymm2, %ymm7
+
+	vunpcklpd	%ymm5, %ymm7, %ymm2
+	vunpckhpd	%ymm5, %ymm7, %ymm3
+	vunpcklpd	%ymm6, %ymm4, %ymm0
+	vunpckhpd	%ymm6, %ymm4, %ymm1
+
+	vextractf128 $0x1, %ymm0, %xmm4
+	vextractf128 $0x1, %ymm1, %xmm5
+	vextractf128 $0x1, %ymm2, %xmm6
+	vextractf128 $0x1, %ymm3, %xmm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+
+	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %xmm12
+	vmulps		%xmm12, %xmm15, %xmm12
+	vaddps		%xmm12, %xmm0, %xmm0
+	vmovaps		32(%r13), %xmm12
+	vmulps		%xmm12, %xmm15, %xmm12
+	vaddps		%xmm12, %xmm1, %xmm1
+	vmovaps		64(%r13), %xmm12
+	vmulps		%xmm12, %xmm15, %xmm12
+	vaddps		%xmm12, %xmm2, %xmm2
+	vmovaps		96(%r13), %xmm12
+	vmulps		%xmm12, %xmm15, %xmm12
+	vaddps		%xmm12, %xmm3, %xmm3
+	vmovaps		128(%r13), %xmm12
+	vmulps		%xmm12, %xmm15, %xmm12
+	vaddps		%xmm12, %xmm4, %xmm4
+	vmovaps		160(%r13), %xmm12
+	vmulps		%xmm12, %xmm15, %xmm12
+	vaddps		%xmm12, %xmm5, %xmm5
+	vmovaps		192(%r13), %xmm12
+	vmulps		%xmm12, %xmm15, %xmm12
+	vaddps		%xmm12, %xmm6, %xmm6
+	vmovaps		224(%r13), %xmm12
+	vmulps		%xmm12, %xmm15, %xmm12
+	vaddps		%xmm12, %xmm7, %xmm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_tran_scale_ab_4x8_gen_lib8, .-inner_tran_scale_ab_4x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10   <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
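+// illustrative C-style sketch (not part of the original source): with
+// beta implicitly 0.0 the routine never touches C and reduces to
+//
+//   for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<8; ii++)
+//           acc[ii+8*jj] *= alpha[0];
+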
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_A0_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_a0_8x4_lib8, @function
+inner_scale_a0_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_a0_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_8x4_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_a0_8x4_lib8, .-inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
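+// illustrative sketch (not part of the original source): the nt micro-kernel
+// accumulates with its lanes rotated (e.g. ymm0 holds d00 d11 d22 d33 ...,
+// ymm1 holds d01 d10 d23 d32 ...); the two rounds of vblendps below, with
+// masks 0xaa/0x55 (odd/even lanes) and 0xcc/0x33 (lane pairs), put plain
+// columns 0..3 back into ymm0..ymm3 before the usual
+// D = alpha*acc + beta*C update
+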
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x4_lib8, @function
+inner_blend_scale_ab_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm10
+	vblendps	$0x55, %ymm3, %ymm2, %ymm11
+
+	vblendps	$0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm10, %ymm9, %ymm1
+	vblendps	$0x33, %ymm10, %ymm9, %ymm3
+
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovaps		0(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x4_lib8, .-inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 8*sdc*sizeof(float)
+// r15  <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x4_gen_lib8, @function
+inner_blend_scale_ab_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_gen_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm10
+	vblendps	$0x55, %ymm3, %ymm2, %ymm11
+
+	vblendps	$0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm10, %ymm9, %ymm1
+	vblendps	$0x33, %ymm10, %ymm9, %ymm3
+
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+
+	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x4_gen_lib8, .-inner_blend_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
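+// illustrative C-style sketch (not part of the original source): same lane
+// un-blending as in inner_blend_scale_ab_8x4_lib8, followed by the fixed
+// alpha=1.0, beta=1.0 update
+//
+//   for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<8; ii++)
+//           acc[ii+8*jj] += C[ii+8*jj];
+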
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_8x4_lib8, @function
+inner_blend_scale_11_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_8x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm10
+	vblendps	$0x55, %ymm3, %ymm2, %ymm11
+
+	vblendps	$0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm10, %ymm9, %ymm1
+	vblendps	$0x33, %ymm10, %ymm9, %ymm3
+
+	vmovaps		0(%r10), %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r10), %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r10), %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r10), %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_8x4_lib8, .-inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_8X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_8x4_gen_lib8, @function
+inner_blend_scale_11_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_gen_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm10
+	vblendps	$0x55, %ymm3, %ymm2, %ymm11
+
+	vblendps	$0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm10, %ymm9, %ymm1
+	vblendps	$0x33, %ymm10, %ymm9, %ymm3
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r11), %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r11), %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r11), %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r11), %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %r15 // C0
+	addq	%r12, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_8x4_gen_lib8, .-inner_blend_scale_11_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_lib8, @function
+inner_store_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_lib8:
+#endif
+#endif
+	
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps 	%ymm3, 96(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_lib8, .-inner_store_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x8_lib8, @function
+inner_store_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x8_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_lib8:
+#endif
+#endif
+	
+	vmovaps 	%xmm0,  0(%r10)
+	vmovaps 	%xmm1, 32(%r10)
+	vmovaps 	%xmm2, 64(%r10)
+	vmovaps 	%xmm3, 96(%r10)
+	vmovaps 	%xmm4, 128(%r10)
+	vmovaps 	%xmm5, 160(%r10)
+	vmovaps 	%xmm6, 192(%r10)
+	vmovaps 	%xmm7, 224(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_lib8, .-inner_store_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
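+// illustrative C-style sketch (not part of the original source): km masks
+// the rows and kn the columns of the 8x4 store; the row mask is built by
+// comparing the per-lane indices in .LC00 against km and fed to vmaskmovps
+//
+//   for(jj=0; jj<4 && jj<kn; jj++)
+//       for(ii=0; ii<8 && ii<km; ii++)
+//           D[ii+8*jj] = acc[ii+8*jj];
+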
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_vs_lib8, @function
+inner_store_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%ymm14, %ymm12, %ymm14
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm14,  0(%r10)
+	cmpl		$2, %r12d
+	jl			0f // end
+	vmaskmovps	%ymm1, %ymm14, 32(%r10)
+	cmpl		$3, %r12d
+	jl			0f // end
+	vmaskmovps	%ymm2, %ymm14, 64(%r10)
+	je			0f // end
+	vmaskmovps	%ymm3, %ymm14, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_vs_lib8, .-inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x8_vs_lib8, @function
+inner_store_4x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %xmm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%xmm14, %xmm12, %xmm14
+
+	// offset==0
+	vmaskmovps	%xmm0, %xmm14,  0(%r10)
+	cmpl		$2, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm1, %xmm14, 32(%r10)
+	cmpl		$3, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm2, %xmm14, 64(%r10)
+	cmpl		$4, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm3, %xmm14, 96(%r10)
+	cmpl		$5, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm4, %xmm14, 128(%r10)
+	cmpl		$6, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm5, %xmm14, 160(%r10)
+	cmpl		$7, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm6, %xmm14, 192(%r10)
+	je			0f // end
+	vmaskmovps	%xmm7, %xmm14, 224(%r10)
+	//
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_vs_lib8, .-inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
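+// illustrative C-style sketch (not part of the original source) of the
+// offset==0 path below: rows [m0,m1) and columns [n0,min(n1,4)) of the 8x4
+// block are written, the remaining elements of D are left untouched
+//
+//   for(jj=n0; jj<n1 && jj<4; jj++)
+//       for(ii=m0; ii<m1 && ii<8; ii++)
+//           D[ii+8*jj] = acc[ii+8*jj];
+//
+// the offset>0 paths (block split across two row panels) are still TODO
+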
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_gen_lib8, @function
+inner_store_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm12, %ymm15
+	vandps		%ymm14, %ymm15, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm15,  0(%r11)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmaskmovps	%ymm1, %ymm15, 32(%r11)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmaskmovps	%ymm2, %ymm15, 64(%r11)
+	je			7f // end
+	vmaskmovps	%ymm3, %ymm15, 96(%r11)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_gen_lib8, .-inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x8_gen_lib8, @function
+inner_store_4x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %xmm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%xmm12, %xmm14, %xmm14
+	vsubps		%xmm15, %xmm12, %xmm15
+	vandps		%xmm14, %xmm15, %xmm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	vmovaps		%xmm3, %xmm2
+	vmovaps		%xmm4, %xmm3
+	vmovaps		%xmm5, %xmm4
+	vmovaps		%xmm6, %xmm5
+	vmovaps		%xmm7, %xmm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	vmovaps		%xmm3, %xmm2
+	vmovaps		%xmm4, %xmm3
+	vmovaps		%xmm5, %xmm4
+	vmovaps		%xmm6, %xmm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	vmovaps		%xmm3, %xmm2
+	vmovaps		%xmm4, %xmm3
+	vmovaps		%xmm5, %xmm4
+	addq		$32, %r11
+
+	cmpl	$3, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	vmovaps		%xmm3, %xmm2
+	vmovaps		%xmm4, %xmm3
+	addq		$32, %r11
+
+	cmpl	$4, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	vmovaps		%xmm3, %xmm2
+	addq		$32, %r11
+
+	cmpl	$5, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	addq		$32, %r11
+
+	cmpl	$6, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$8, %eax
+	jle		0f
+	movl	$8, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%xmm0, %xmm15,  0(%r11)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm1, %xmm15, 32(%r11)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm2, %xmm15, 64(%r11)
+	cmpl		$4, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm3, %xmm15, 96(%r11)
+	cmpl		$5, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm4, %xmm15, 128(%r11)
+	cmpl		$6, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm5, %xmm15, 160(%r11)
+	cmpl		$7, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm6, %xmm15, 192(%r11)
+	je			7f // end
+	vmaskmovps	%xmm7, %xmm15, 224(%r11)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_gen_lib8, .-inner_store_4x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
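+// illustrative C-style sketch (not part of the original source): only the
+// lower-triangular part of the 8x4 block is written, the strictly upper
+// part keeps the values already in D (re-blended below before the store)
+//
+//   for(jj=0; jj<4; jj++)
+//       for(ii=jj; ii<8; ii++)
+//           D[ii+8*jj] = acc[ii+8*jj];
+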
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_lib8, @function
+inner_store_l_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_lib8:
+#endif
+#endif
+	
+	vmovaps 	32(%r10), %ymm12
+	vmovaps 	64(%r10), %ymm13
+	vmovaps 	96(%r10), %ymm14
+
+	vblendps	$0x1, %ymm12, %ymm1, %ymm1
+	vblendps	$0x3, %ymm13, %ymm2, %ymm2
+	vblendps	$0x7, %ymm14, %ymm3, %ymm3
+
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps 	%ymm3, 96(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_lib8, .-inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_vs_lib8, @function
+inner_store_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	vmaskmovps	%ymm0, %ymm15,  0(%r10)
+	cmpl		$2, %r12d
+	jl			0f // end
+	vmovaps 	32(%r10), %ymm12
+	vblendps	$0x1, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm15, 32(%r10)
+	cmpl		$3, %r12d
+	jl			0f // end
+	vmovaps 	64(%r10), %ymm12
+	vblendps	$0x3, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm15, 64(%r10)
+	je			0f // end
+	vmovaps 	96(%r10), %ymm12
+	vblendps	$0x7, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm15, 96(%r10)
+	//
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_vs_lib8, .-inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_gen_lib8, @function
+inner_store_l_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm12, %ymm15
+	vandps		%ymm14, %ymm15, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm15,  0(%r11)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmovaps 	32(%r11), %ymm12
+	vblendps	$0x1, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm15, 32(%r11)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmovaps 	64(%r11), %ymm12
+	vblendps	$0x3, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm15, 64(%r11)
+	je			7f // end
+	vmovaps 	96(%r11), %ymm12
+	vblendps	$0x7, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm15, 96(%r11)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_gen_lib8, .-inner_store_l_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                               rdi    rsi           rdx       rcx       r8           r9        rsp+8
+// void kernel_sgemm_nt_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
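+//
+// reference operation, illustrative scalar C sketch (not part of the
+// original source); A, B, C and D are panel-major with 8-float columns
+//
+//   for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<8; ii++)
+//           {
+//           float tmp = 0.0;
+//           for(kk=0; kk<k; kk++)
+//               tmp += A[ii+8*kk] * B[jj+8*kk];
+//           D[ii+8*jj] = alpha[0]*tmp + beta[0]*C[ii+8*jj];
+//           }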
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x4_lib8
+	.type kernel_sgemm_nt_8x4_lib8, @function
+kernel_sgemm_nt_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x4_lib8
+_kernel_sgemm_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x4_lib8
+	.def kernel_sgemm_nt_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x4_lib8, .-kernel_sgemm_nt_8x4_lib8
+#endif
+
+
+
+
+
+//                               rdi    rsi           rdx       rcx       r8           r9        rsp+8
+// void kernel_sgemm_nt_4x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_4x8_lib8
+	.type kernel_sgemm_nt_4x8_lib8, @function
+kernel_sgemm_nt_4x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_4x8_lib8
+_kernel_sgemm_nt_4x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_4x8_lib8
+	.def kernel_sgemm_nt_4x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // B
+	movq	ARG3, %r12  // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_4x8_lib8, .-kernel_sgemm_nt_4x8_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi           rdx       rcx       r8           r9        rsp+8     rsp+16  rsp+24
+// void kernel_sgemm_nt_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x4_vs_lib8
+	.type kernel_sgemm_nt_8x4_vs_lib8, @function
+kernel_sgemm_nt_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x4_vs_lib8
+_kernel_sgemm_nt_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x4_vs_lib8
+	.def kernel_sgemm_nt_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x4_vs_lib8, .-kernel_sgemm_nt_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi           rdx       rcx       r8           r9        rsp+8     rsp+16  rsp+24
+// void kernel_sgemm_nt_4x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_4x8_vs_lib8
+	.type kernel_sgemm_nt_4x8_vs_lib8, @function
+kernel_sgemm_nt_4x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_4x8_vs_lib8
+_kernel_sgemm_nt_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_4x8_vs_lib8
+	.def kernel_sgemm_nt_4x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // B
+	movq	ARG3, %r12  // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_4x8_vs_lib8, .-kernel_sgemm_nt_4x8_vs_lib8
+#endif
+
+
+
+
+
+//                                   rdi    rsi           rdx       rcx       r8           r9           rsp+8     rsp+16   rsp+24       rsp+32    rsp+40   rsp+48  rsp+56  rsp+64  rsp+72
+// void kernel_sgemm_nt_8x4_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x4_gen_lib8
+	.type kernel_sgemm_nt_8x4_gen_lib8, @function
+kernel_sgemm_nt_8x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x4_gen_lib8
+_kernel_sgemm_nt_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x4_gen_lib8
+	.def kernel_sgemm_nt_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // offsetC
+	movq	ARG7, %r13 // C
+	movq	ARG8, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG9, %r10 // offsetD
+	movq	ARG10, %r11 // D
+	movq	ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG12, %r13 // m0
+	movq	ARG13, %r14 // m1
+	movq	ARG14, %r15 // n0
+	movq	ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x4_gen_lib8, .-kernel_sgemm_nt_8x4_gen_lib8
+#endif
+
+
+
+
+
+//                                   rdi    rsi           rdx       rcx       r8           r9           rsp+8     rsp+16   rsp+24       rsp+32    rsp+40   rsp+48  rsp+56  rsp+64  rsp+72
+// void kernel_sgemm_nt_4x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_4x8_gen_lib8
+	.type kernel_sgemm_nt_4x8_gen_lib8, @function
+kernel_sgemm_nt_4x8_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_4x8_gen_lib8
+_kernel_sgemm_nt_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_4x8_gen_lib8
+	.def kernel_sgemm_nt_4x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // B
+	movq	ARG3, %r12  // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner tran scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // offsetC
+	movq	ARG7, %r13 // C
+	movq	ARG8, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x8_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG9, %r10 // offsetD
+	movq	ARG10, %r11 // D
+	movq	ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG12, %r13 // m0
+	movq	ARG13, %r14 // m1
+	movq	ARG14, %r15 // n0
+	movq	ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_4x8_gen_lib8, .-kernel_sgemm_nt_4x8_gen_lib8
+#endif
+
+
+
+
+
+//                               rdi    rsi           rdx        rcx         r8         r9      rsp+8        rsp+16    rsp+24
+// void kernel_sgemm_nn_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x4_lib8
+	.type kernel_sgemm_nn_8x4_lib8, @function
+kernel_sgemm_nn_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x4_lib8
+_kernel_sgemm_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x4_lib8
+	.def kernel_sgemm_nn_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x4_lib8, .-kernel_sgemm_nn_8x4_lib8
+#endif
+
+
+
+
+
+//                               1      2             3         4            5         6        7            8         9         10      11
+// void kernel_sgemm_nn_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x4_vs_lib8
+	.type kernel_sgemm_nn_8x4_vs_lib8, @function
+kernel_sgemm_nn_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x4_vs_lib8
+_kernel_sgemm_nn_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x4_vs_lib8
+	.def kernel_sgemm_nn_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // km
+	movq	ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x4_vs_lib8, .-kernel_sgemm_nn_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                   rdi    rsi           rdx       rcx       r8        r9       rsp+8        rsp+16    rsp+24    rsp+32    rsp+40   rsp+48     rsp+56   rsp+64  rsp+72  rsp+80  rsp+88
+// void kernel_sgemm_nn_8x4_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x4_gen_lib8
+	.type kernel_sgemm_nn_8x4_gen_lib8, @function
+kernel_sgemm_nn_8x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x4_gen_lib8
+_kernel_sgemm_nn_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x4_gen_lib8
+	.def kernel_sgemm_nn_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // offsetC
+	movq	ARG9, %r13 // C
+	movq	ARG10, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG11, %r10 // offsetD
+	movq	ARG12, %r11 // D
+	movq	ARG13, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG14, %r13 // m0
+	movq	ARG15, %r14 // m1
+	movq	ARG16, %r15 // n0
+	movq	ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x4_gen_lib8, .-kernel_sgemm_nn_8x4_gen_lib8
+#endif
+
+
+
+
+
+//                                 rdi    rsi           rdx       rcx       r8           r9        rsp+8
+// void kernel_ssyrk_nt_l_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_8x4_lib8
+	.type kernel_ssyrk_nt_l_8x4_lib8, @function
+kernel_ssyrk_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_8x4_lib8
+_kernel_ssyrk_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_8x4_lib8
+	.def kernel_ssyrk_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_8x4_lib8, .-kernel_ssyrk_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+//                                    1      2             3         4         5            6         7         8       9
+// void kernel_ssyrk_nt_l_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
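+// vs ("variable size") variant: km and kn give how many rows and columns of the 8x4 block are
+// actually written, so the kernel can handle the bottom/right edge of a matrix.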
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_8x4_vs_lib8
+	.type kernel_ssyrk_nt_l_8x4_vs_lib8, @function
+kernel_ssyrk_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_8x4_vs_lib8
+_kernel_ssyrk_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_8x4_vs_lib8
+	.def kernel_ssyrk_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_8x4_vs_lib8, .-kernel_ssyrk_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                      edi    rsi       rdx       rcx       r8        r9        rsp+8
+// void kernel_strsm_nt_rl_inv_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
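+// in effect D = (C - A*B^T) * E^{-T}, with E a 4x4 lower-triangular factor; inv_diag_E holds the
+// reciprocals of E's diagonal so the solve uses multiplications instead of divisions.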
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_8x4_lib8
+	.type kernel_strsm_nt_rl_inv_8x4_lib8, @function
+kernel_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_8x4_lib8
+_kernel_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_8x4_lib8
+	.def kernel_strsm_nt_rl_inv_8x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_8x4_lib8, .-kernel_strsm_nt_rl_inv_8x4_lib8
+#endif
+
+
+
+
+
+//                                         edi    rsi       rdx       rcx       r8        r9        rsp+8               rsp+16  rsp+24
+// void kernel_strsm_nt_rl_inv_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_8x4_vs_lib8
+	.type kernel_strsm_nt_rl_inv_8x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_8x4_vs_lib8
+_kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_8x4_vs_lib8
+	.def kernel_strsm_nt_rl_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn // TODO scale gen
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_8x4_vs_lib8, .-kernel_strsm_nt_rl_inv_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                            1       2          3          4       5          6          7         8         9         10
+// void kernel_sgemm_strsm_nt_rl_inv_8x4_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
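+// fused update + solve: accumulates + Ap*Bp^T over kp columns and - Am*Bm^T over km columns, adds
+// C, then applies the same right-lower-transposed solve as kernel_strsm_nt_rl_inv_8x4_lib8.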
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_8x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10   // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_8x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+#endif
+
+
+
+
+
+//                                               1       2          3          4       5          6          7         8         9         10                 11      12
+// void kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10  // C 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D 
+	movq	ARG11, %r11 // km 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                  edi    rsi       rdx       rcx       r8        r9
+// void kernel_spotrf_nt_l_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
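+// subtracts A*B^T from C and factorizes the result: the top 4x4 gets its lower Cholesky factor,
+// the remaining rows are solved against it, and the reciprocals of the diagonal are returned in
+// inv_diag_D.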
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_8x4_lib8
+	.type kernel_spotrf_nt_l_8x4_lib8, @function
+kernel_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_8x4_lib8
+_kernel_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_8x4_lib8
+	.def kernel_spotrf_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_8x4_lib8, .-kernel_spotrf_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+//                                     edi    rsi       rdx       rcx       r8        r9                  rsp+8   rsp+16
+// void kernel_spotrf_nt_l_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_8x4_vs_lib8
+	.type kernel_spotrf_nt_l_8x4_vs_lib8, @function
+kernel_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_8x4_vs_lib8
+_kernel_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_8x4_vs_lib8
+	.def kernel_spotrf_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movq	ARG8, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG7, %r11 // km
+	movq	ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_8x4_vs_lib8, .-kernel_spotrf_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                        1       2          3          4       5          6          7         8         9
+// void kernel_ssyrk_spotrf_nt_l_8x4_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
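+// fused syrk + factorization: forms C + Ap*Bp^T - Am*Bm^T, then factorizes it exactly as in
+// kernel_spotrf_nt_l_8x4_lib8 (this non-vs version fixes kn = 4 before the vs edge routine).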
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_8x4_lib8
+	.type kernel_ssyrk_spotrf_nt_l_8x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_8x4_lib8
+_kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_8x4_lib8
+	.def kernel_ssyrk_spotrf_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$4, %r11d
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_8x4_lib8, .-kernel_ssyrk_spotrf_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+//                                           1       2          3          4       5          6          7         8         9                  10      11
+// void kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+	.type kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+	.def kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+	movq	ARG10, %r11 // km 
+	movq	ARG11, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                  1      2             3         4            5         6        7
+// void kernel_strmm_nn_rl_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D);
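+// D = alpha * A * B with B lower triangular (right-lower, not transposed): the trmm edge routine
+// handles the triangular leading part of B, then the regular nn gemm path accumulates the rest.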
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_8x4_lib8
+	.type kernel_strmm_nn_rl_8x4_lib8, @function
+kernel_strmm_nn_rl_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_8x4_lib8
+_kernel_strmm_nn_rl_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_8x4_lib8
+	.def kernel_strmm_nn_rl_8x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG5, %r12 // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_8x4_lib8, .-kernel_strmm_nn_rl_8x4_lib8
+#endif
+
+
+
+
+
+//                                     1      2             3         4            5         6        7         8       9
+// void kernel_strmm_nn_rl_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_8x4_vs_lib8
+	.type kernel_strmm_nn_rl_8x4_vs_lib8, @function
+kernel_strmm_nn_rl_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_8x4_vs_lib8
+_kernel_strmm_nn_rl_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_8x4_vs_lib8
+	.def kernel_strmm_nn_rl_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG5, %r12 // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_8x4_vs_lib8, .-kernel_strmm_nn_rl_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                      1      2             3         4            5         6        7            8         9        10      11      12      13
+// void kernel_strmm_nn_rl_8x4_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
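+// gen variant of the trmm above: D is written with row offset offsetD and clipped to rows [m0,m1)
+// and columns [n0,n1) via inner_store_8x4_gen_lib8; there is no C term, only the alpha scaling.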
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_8x4_gen_lib8
+	.type kernel_strmm_nn_rl_8x4_gen_lib8, @function
+kernel_strmm_nn_rl_8x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_8x4_gen_lib8
+_kernel_strmm_nn_rl_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_8x4_gen_lib8
+	.def kernel_strmm_nn_rl_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG5, %r12 // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // offsetD
+	movq	ARG8, %r11 // D
+	movq	ARG9, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG10, %r13 // m0
+	movq	ARG11, %r14 // m1
+	movq	ARG12, %r15 // n0
+	movq	ARG13, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_8x4_gen_lib8, .-kernel_strmm_nn_rl_8x4_gen_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
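+// note: LC00-LC02 appear to provide the lane indices {0.5, 1.5, ..., 23.5} that the vs/gen store
+// routines compare against row bounds to build write masks; LC03 is a sign pattern of six 1.0 and
+// two -1.0 values.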
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_8x8_lib8.S b/kernel/avx/kernel_sgemm_8x8_lib8.S
new file mode 100644
index 0000000..354fa83
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_8x8_lib8.S
@@ -0,0 +1,5514 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1  <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2  <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3  <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1  <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2  <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3  <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
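+//
+// note: each unrolled step multiplies one 8-float column of A by the two broadcast 128-bit halves
+// of the matching column of B; the vshufps with 0xb1/0x4e rotate B's lanes so that ymm0-ymm7
+// accumulate the 8x8 product in the permuted order documented above, which the blend/scale
+// routines undo before storing.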
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nt_8x8_lib8, @function
+inner_kernel_gemm_add_nt_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nt_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_8x8_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vbroadcastf128	16(%r12), %ymm15 // B
+	vmovaps			32(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	32(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vbroadcastf128	48(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm7, %ymm7
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	80(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm7, %ymm7
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	96(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vbroadcastf128	112(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm7, %ymm7
+	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	16(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm7, %ymm7
+	vmovaps			32(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	32(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vbroadcastf128	48(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm7, %ymm7
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	80(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm7, %ymm7
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	96(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vbroadcastf128	112(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm7, %ymm7
+//	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm14, %ymm11
+//	vbroadcastf128	0(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm13, %ymm15, %ymm11
+//	vbroadcastf128	16(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm7, %ymm7
+//	vmovaps			32(%r11), %ymm13 // A
+
+
+//	cmpl	$4, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vbroadcastf128	16(%r12), %ymm14 // B
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$32, %r12
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm7, %ymm7
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nt_8x8_lib8, .-inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1  <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2  <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3  <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1  <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2  <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3  <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
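+//
+// note: identical to inner_kernel_gemm_add_nt_8x8_lib8 above, except that vsubps replaces vaddps,
+// so the product is subtracted from the accumulators instead of added.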
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_sub_nt_8x8_lib8, @function
+inner_kernel_gemm_sub_nt_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_sub_nt_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_8x8_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vbroadcastf128	16(%r12), %ymm15 // B
+	vmovaps			32(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	32(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vbroadcastf128	48(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm7, %ymm7
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	80(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm7, %ymm7
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	96(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vbroadcastf128	112(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm7, %ymm7
+	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	16(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm7, %ymm7
+	vmovaps			32(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	32(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vbroadcastf128	48(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm7, %ymm7
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	80(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm7, %ymm7
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	96(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vbroadcastf128	112(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm7, %ymm7
+//	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm14, %ymm11
+//	vbroadcastf128	0(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm13, %ymm15, %ymm11
+//	vbroadcastf128	16(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm7, %ymm7
+//	vmovaps			32(%r11), %ymm13 // A
+
+
+//	cmpl	$4, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
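+	// note: the clean-up loop below handles the k%4 leftover columns of the
+	// nt subtraction one at a time: B[0..3] and B[4..7] are re-broadcast for
+	// every column and rotated with vshufps so that all eight accumulators
+	// ymm0..ymm7 get updated.
+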
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vbroadcastf128	16(%r12), %ymm14 // B
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$32, %r12
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm7, %ymm7
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_sub_nt_8x8_lib8, .-inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
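+// Note: scalar reference of the accumulation performed below, for
+// illustration only (assumes bs=8 panel-major storage for A and B, with B
+// advanced to its next row-panel every 8 values of kk via r13/r14):
+//
+//   for(kk=0; kk<k; kk++)
+//     for(jj=0; jj<8; jj++)
+//       for(ii=0; ii<8; ii++)
+//         acc[ii+8*jj] += A[ii+8*kk] * B[kk+8*jj];
+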
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nn_8x8_lib8, @function
+inner_kernel_gemm_add_nn_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_8x8_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r12, %r14 // B_next <- B
+	addq	%r13, %r14 // B_next <- B + 8*sdb*sizeof(float)
+
+	cmpl	$8, %r10d
+	jl		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	0(%r14) // software prefetch
+	prefetcht0	64(%r14) // software prefetch
+	prefetcht0	128(%r14) // software prefetch
+	prefetcht0	192(%r14) // software prefetch
+
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	128(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	160(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	192(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	224(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 1
+	vmovaps			32(%r11), %ymm12 // A[0]
+	vbroadcastss	4(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	68(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	100(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	132(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	164(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	196(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	228(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 2
+	vmovaps			64(%r11), %ymm12 // A[0]
+	vbroadcastss	8(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	40(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	72(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	104(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	136(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	168(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	200(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	232(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 3
+	vmovaps			96(%r11), %ymm12 // A[0]
+	vbroadcastss	12(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	44(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	76(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	108(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	140(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	172(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	204(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	236(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 4
+	vmovaps			128(%r11), %ymm12 // A[0]
+	vbroadcastss	16(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	48(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	80(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	112(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	144(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	176(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	208(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	240(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 5
+	vmovaps			160(%r11), %ymm12 // A[0]
+	vbroadcastss	20(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	52(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	84(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	116(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	148(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	180(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	212(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	244(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 6
+	vmovaps			192(%r11), %ymm12 // A[0]
+	vbroadcastss	24(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	56(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	88(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	120(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	152(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	184(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	216(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	248(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 7
+	vmovaps			224(%r11), %ymm12 // A[0]
+	vbroadcastss	28(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	60(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	92(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	124(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	156(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	188(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	220(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	252(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	subl	$8, %r10d
+	addq	$256, %r11
+
+	mov		%r14, %r12
+	addq	%r13, %r14
+
+	cmpl	$7, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean1-up loop
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean1-up loop
+	
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	128(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	160(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	192(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	224(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$4, %r12
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nn_8x8_lib8, .-inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(float)
+// r14   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(8-offB)
+// r11   <- A+(8-offB)*bs*sizeof(float)
+// r12   <- B-offB+bs*sdb*sizeof(float)
+// r13   <- bs*sdb*sizeof(float)
+// r14   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
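+// Note: when B starts at a non-zero offset inside its panel (offB>0), this
+// edge consumes min(k, 8-offB) columns one at a time so that the main kernel
+// can resume on a panel boundary; B is then advanced to the start of its
+// next row-panel.
+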
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_gemm_add_nn_8x8_lib8, @function
+inner_edge_gemm_add_nn_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_8x8_lib8:
+#endif
+#endif
+	
+	cmpl			$0, %r14d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$8, %ebx
+	subl			%r14d, %ebx // 8-offsetB
+	cmpl			%r10d, %ebx
+//	jle				0f
+//	movl			%r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+	cmovgl			%r10d, %ebx // kend=min(k,8-offsetB)
+
+	movl			%r14d, %eax
+	sall			$2, %eax // offsetB*sizeof(float)
+	addq			%rax, %r12 // B+offsetB*sizeof(float)
+
+1:
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	128(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	160(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	192(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	224(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	subl			$1, %ebx // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(float)
+	addq			$4, %r12 // B+1*sizeof(float)
+
+	cmpl			$0, %ebx
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r13, %r12
+	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_gemm_add_nn_8x8_lib8, .-inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
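+// Note: scalar sketch of the solve performed below, for illustration only
+// (bs=8 panel storage assumed; acc is the 8x8 accumulator in ymm0..ymm7,
+// one column per register; the triangular coefficients are read from D and
+// the reciprocal diagonal from inv_diag_D; columns beyond kn are skipped):
+//
+//   for(jj=0; jj<8; jj++) {
+//     for(ii=0; ii<8; ii++) acc[ii+8*jj] *= inv_diag_D[jj];
+//     for(ll=jj+1; ll<8; ll++)
+//       for(ii=0; ii<8; ii++) acc[ii+8*ll] -= D[ll+8*jj] * acc[ii+8*jj];
+//   }
+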
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsm_rlt_inv_8x8_vs_lib8, @function
+inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsm_rlt_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#endif
+#endif
+
+	vbroadcastss	0(%r11), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vbroadcastss	4(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm1, %ymm1
+	vbroadcastss	8(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vbroadcastss	12(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+	vbroadcastss	16(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm4, %ymm4
+	vbroadcastss	20(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm5, %ymm5
+	vbroadcastss	24(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm6, %ymm6
+	vbroadcastss	28(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	4(%r11), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vbroadcastss	40(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vbroadcastss	44(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+	vbroadcastss	48(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm4, %ymm4
+	vbroadcastss	52(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm5, %ymm5
+	vbroadcastss	56(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm6, %ymm6
+	vbroadcastss	60(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	8(%r11), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vbroadcastss	76(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+	vbroadcastss	80(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm4, %ymm4
+	vbroadcastss	84(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm5, %ymm5
+	vbroadcastss	88(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm6, %ymm6
+	vbroadcastss	92(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	12(%r11), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+	vbroadcastss	112(%r10), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm4, %ymm4
+	vbroadcastss	116(%r10), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm5, %ymm5
+	vbroadcastss	120(%r10), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm6, %ymm6
+	vbroadcastss	124(%r10), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	16(%r11), %ymm13
+	vmulps			%ymm4, %ymm13, %ymm4
+	cmpl			$6, %r12d
+	jl				0f // ret
+	vbroadcastss	148(%r10), %ymm13
+	vmulps			%ymm4, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm5, %ymm5
+	vbroadcastss	152(%r10), %ymm13
+	vmulps			%ymm4, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm6, %ymm6
+	vbroadcastss	156(%r10), %ymm13
+	vmulps			%ymm4, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	20(%r11), %ymm13
+	vmulps			%ymm5, %ymm13, %ymm5
+	cmpl			$7, %r12d
+	jl				0f // ret
+	vbroadcastss	184(%r10), %ymm13
+	vmulps			%ymm5, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm6, %ymm6
+	vbroadcastss	188(%r10), %ymm13
+	vmulps			%ymm5, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	24(%r11), %ymm13
+	vmulps			%ymm6, %ymm13, %ymm6
+	cmpl			$8, %r12d
+	jl				0f // ret
+	vbroadcastss	220(%r10), %ymm13
+	vmulps			%ymm6, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	28(%r11), %ymm13
+	vmulps			%ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsm_rlt_inv_8x8_vs_lib8, .-inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization 
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
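+// Note: the factorization below proceeds one column at a time: the pivot of
+// column jj is tested, and if it is > 0.0 its reciprocal square root is
+// stored to inv_diag_E[jj] and used to scale the column, otherwise 0.0 is
+// used instead (labels 1,3,5,... at the end); the scaled column is then used
+// to down-date the columns to its right; columns beyond kn are skipped.
+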
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_8x8_vs_lib8, @function
+inner_edge_potrf_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x8_vs_lib8:
+#endif
+#endif
+
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vmovss		%xmm0, %xmm0, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+2:
+	vmovss		%xmm13, 0(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm0
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm1, %ymm1
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vperm2f128	$0x11, %ymm0, %ymm0, %ymm11
+	vpermilps	$0x00, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm4, %ymm4
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm5, %ymm5
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vpermilps	$0x55, %xmm1, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+4:
+	vmovss		%xmm13, 4(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm1
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm11
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vperm2f128	$0x11, %ymm1, %ymm1, %ymm11
+	vpermilps	$0x00, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm4, %ymm4
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm5, %ymm5
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vpermilps	$0xaa, %xmm2, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+6:
+	vmovss		%xmm13, 8(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm2
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm11
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vperm2f128	$0x11, %ymm2, %ymm2, %ymm11
+	vpermilps	$0x00, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm4, %ymm4
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm5, %ymm5
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vpermilps	$0xff, %xmm3, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+8:
+	vmovss		%xmm13, 12(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm3
+	vperm2f128	$0x11, %ymm3, %ymm3, %ymm11
+	vpermilps	$0x00, %ymm11, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm4, %ymm4
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm5, %ymm5
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vextractf128	$0x1, %ymm4, %xmm13
+//	vpermilps	$0x00, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_44 > 0.0 ?
+	jbe			9f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+10:
+	vmovss		%xmm13, 16(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm4, %ymm13, %ymm4
+	cmpl		$6, %r11d
+	jl			0f // ret
+	vperm2f128	$0x11, %ymm4, %ymm4, %ymm11
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm5, %ymm5
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vextractf128	$0x1, %ymm5, %xmm13
+	vpermilps	$0x55, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_55 > 0.0 ?
+	jbe			11f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+12:
+	vmovss		%xmm13, 20(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm5, %ymm13, %ymm5
+	cmpl		$7, %r11d
+	jl			0f // ret
+	vperm2f128	$0x11, %ymm5, %ymm5, %ymm11
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm5, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm5, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vextractf128	$0x1, %ymm6, %xmm13
+	vpermilps	$0xaa, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_66 > 0.0 ?
+	jbe			13f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+14:
+	vmovss		%xmm13, 24(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm6, %ymm13, %ymm6
+	cmpl		$8, %r11d
+	jl			0f // ret
+	vperm2f128	$0x11, %ymm6, %ymm6, %ymm11
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm6, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vextractf128	$0x1, %ymm7, %xmm13
+	vpermilps	$0xff, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_77 > 0.0 ?
+	jbe			15f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+16:
+	vmovss		%xmm13, 28(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm7, %ymm13, %ymm7
+
+
+	jmp		0f
+
+
+1:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+9:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		10b
+
+11:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		12b
+
+13:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		14b
+
+15:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		16b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_8x8_vs_lib8, .-inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
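+// Note: computes acc = alpha*acc + beta*C on the 8x8 accumulator in
+// ymm0..ymm7; the loads of C are skipped entirely when beta == 0.0.
+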
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x8_lib8, @function
+inner_scale_ab_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovaps		0(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+	vmovaps		128(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+	vmovaps		160(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+	vmovaps		192(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+	vmovaps		224(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x8_lib8, .-inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// r15  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
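+// Note: same scaling as inner_scale_ab_8x8_lib8, but for a C matrix that may
+// start at a row offset inside its panel; only the offset==0 path is
+// implemented, the offset>0 branches below are TODO stubs. In the offset==0
+// path beta is kept in ymm15 and 0.0 in ymm14.
+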
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x8_gen_lib8, @function
+inner_scale_ab_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_gen_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+
+	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+	vmovaps		128(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm4, %ymm12, %ymm4
+	vmovaps		160(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm5, %ymm12, %ymm5
+	vmovaps		192(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm6, %ymm12, %ymm6
+	vmovaps		224(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm7, %ymm12, %ymm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x8_gen_lib8, .-inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
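+// Note: the nt kernels above accumulate each group of four columns in a
+// rotated order (a by-product of the vshufps rotations applied to B); the
+// vblendps passes below regroup the accumulators into plain columns
+// ymm0..ymm7 before alpha and beta are applied.
+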
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x8_lib8, @function
+inner_blend_scale_ab_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x8_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
+	vblendps	$0x55, %ymm3, %ymm2, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
+	vblendps	$0x33, %ymm14, %ymm13, %ymm3
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
+	vblendps	$0x55, %ymm5, %ymm4, %ymm13
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
+	vblendps	$0x55, %ymm7, %ymm6, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
+	vblendps	$0x33, %ymm15, %ymm12, %ymm6
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
+	vblendps	$0x33, %ymm14, %ymm13, %ymm7
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovaps		0(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+	vmovaps		128(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+	vmovaps		160(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+	vmovaps		192(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+	vmovaps		224(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x8_lib8, .-inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// r15  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
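+// Note: blend (as in inner_blend_scale_ab_8x8_lib8) followed by the generic
+// alpha/beta scaling of C at a possible row offset; only the offset==0 path
+// is implemented, the offset>0 branches are TODO stubs. Beta is kept in
+// ymm15 and 0.0 in ymm14.
+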
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x8_gen_lib8, @function
+inner_blend_scale_ab_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x8_gen_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
+	vblendps	$0x55, %ymm3, %ymm2, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
+	vblendps	$0x33, %ymm14, %ymm13, %ymm3
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
+	vblendps	$0x55, %ymm5, %ymm4, %ymm13
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
+	vblendps	$0x55, %ymm7, %ymm6, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
+	vblendps	$0x33, %ymm15, %ymm12, %ymm6
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
+	vblendps	$0x33, %ymm14, %ymm13, %ymm7
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+
+	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+	vmovaps		128(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm4, %ymm12, %ymm4
+	vmovaps		160(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm5, %ymm12, %ymm5
+	vmovaps		192(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm6, %ymm12, %ymm6
+	vmovaps		224(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm7, %ymm12, %ymm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x8_gen_lib8, .-inner_blend_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
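+// Note: same unscrambling blend as inner_blend_scale_ab_8x8_lib8, specialized
+// for alpha=1.0 and beta=1.0, so C is simply added to the accumulator.
+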
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_8x8_lib8, @function
+inner_blend_scale_11_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_8x8_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x8_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
+	vblendps	$0x55, %ymm3, %ymm2, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
+	vblendps	$0x33, %ymm14, %ymm13, %ymm3
+
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
+	vblendps	$0x55, %ymm5, %ymm4, %ymm13
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
+	vblendps	$0x55, %ymm7, %ymm6, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
+	vblendps	$0x33, %ymm15, %ymm12, %ymm6
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
+	vblendps	$0x33, %ymm14, %ymm13, %ymm7
+
+	vmovaps		0(%r10), %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r10), %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r10), %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r10), %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+	vmovaps		128(%r10), %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+	vmovaps		160(%r10), %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+	vmovaps		192(%r10), %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+	vmovaps		224(%r10), %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_8x8_lib8, .-inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_8X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_8x8_gen_lib8, @function
+inner_blend_scale_11_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x8_gen_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
+	vblendps	$0x55, %ymm3, %ymm2, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
+	vblendps	$0x33, %ymm14, %ymm13, %ymm3
+
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
+	vblendps	$0x55, %ymm5, %ymm4, %ymm13
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
+	vblendps	$0x55, %ymm7, %ymm6, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
+	vblendps	$0x33, %ymm15, %ymm12, %ymm6
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
+	vblendps	$0x33, %ymm14, %ymm13, %ymm7
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r11), %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r11), %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r11), %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r11), %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+	vmovaps		128(%r11), %ymm12
+	vaddps		%ymm4, %ymm12, %ymm4
+	vmovaps		160(%r11), %ymm12
+	vaddps		%ymm5, %ymm12, %ymm5
+	vmovaps		192(%r11), %ymm12
+	vaddps		%ymm6, %ymm12, %ymm6
+	vmovaps		224(%r11), %ymm12
+	vaddps		%ymm7, %ymm12, %ymm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %r15 // C0
+	addq	%r12, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_8x8_gen_lib8, .-inner_blend_scale_11_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x8_lib8, @function
+inner_store_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x8_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_lib8:
+#endif
+#endif
+	
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps 	%ymm3, 96(%r10)
+	vmovaps 	%ymm4, 128(%r10)
+	vmovaps 	%ymm5, 160(%r10)
+	vmovaps 	%ymm6, 192(%r10)
+	vmovaps 	%ymm7, 224(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x8_lib8, .-inner_store_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
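+// Note: masked store: the row mask is built by broadcasting float(km) and
+// subtracting it from the lane indices assumed to be held in LC00, so the
+// sign bits select the first km rows only; columns 0..4 are always written,
+// kn then decides whether columns 5, 6 and 7 are written as well.
+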
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x8_vs_lib8, @function
+inner_store_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	vmaskmovps	%ymm0, %ymm15,  0(%r10)
+	vmaskmovps	%ymm1, %ymm15,  32(%r10)
+	vmaskmovps	%ymm2, %ymm15,  64(%r10)
+	vmaskmovps	%ymm3, %ymm15,  96(%r10)
+	vmaskmovps	%ymm4, %ymm15,  128(%r10)
+	cmpl		$6, %r12d
+	jl			0f // end
+	vmaskmovps	%ymm5, %ymm15, 160(%r10)
+	cmpl		$7, %r12d
+	jl			0f // end
+	vmaskmovps	%ymm6, %ymm15, 192(%r10)
+	je			0f // end
+	vmaskmovps	%ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x8_vs_lib8, .-inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
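+// Note: generalized store: the row mask is built from m0/m1 against the lane
+// indices assumed to be in LC00, the accumulator columns are shifted left by
+// up to 3 positions according to n0, and the column count is clipped to
+// n1-n0; only the offset==0 case is implemented, the offset>0 branches are
+// TODO stubs.
+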
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x8_gen_lib8, @function
+inner_store_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm12, %ymm15
+	vandps		%ymm14, %ymm15, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$8, %eax
+	jle		0f
+	movl	$8, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm15,  0(%r11)
+	vmaskmovps	%ymm1, %ymm15,  32(%r11)
+	vmaskmovps	%ymm2, %ymm15,  64(%r11)
+	vmaskmovps	%ymm3, %ymm15,  96(%r11)
+	vmaskmovps	%ymm4, %ymm15,  128(%r11)
+	cmpl		$6, %r15d
+	jl			7f // end
+	vmaskmovps	%ymm5, %ymm15, 160(%r11)
+	cmpl		$7, %r15d
+	jl			7f // end
+	vmaskmovps	%ymm6, %ymm15, 192(%r11)
+	je			7f // end
+	vmaskmovps	%ymm7, %ymm15, 224(%r11)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x8_gen_lib8, .-inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
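+// Note: lower-triangular store: for column jj the first jj entries already
+// in D are kept via vblendps and only the entries on and below the diagonal
+// are overwritten from the accumulator.
+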
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x8_lib8, @function
+inner_store_l_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x8_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_lib8:
+#endif
+#endif
+	
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps		32(%r10), %ymm14
+	vblendps	$0x01, %ymm14, %ymm1, %ymm1
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps		64(%r10), %ymm14
+	vblendps	$0x03, %ymm14, %ymm2, %ymm2
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps		96(%r10), %ymm14
+	vblendps	$0x07, %ymm14, %ymm3, %ymm3
+	vmovaps 	%ymm3, 96(%r10)
+	vmovaps		128(%r10), %ymm14
+	vblendps	$0x0f, %ymm14, %ymm4, %ymm4
+	vmovaps 	%ymm4, 128(%r10)
+	vmovaps		160(%r10), %ymm14
+	vblendps	$0x1f, %ymm14, %ymm5, %ymm5
+	vmovaps 	%ymm5, 160(%r10)
+	vmovaps		192(%r10), %ymm14
+	vblendps	$0x3f, %ymm14, %ymm6, %ymm6
+	vmovaps 	%ymm6, 192(%r10)
+	vmovaps		224(%r10), %ymm14
+	vblendps	$0x7f, %ymm14, %ymm7, %ymm7
+	vmovaps 	%ymm7, 224(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_lib8, .-inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x8_vs_lib8, @function
+inner_store_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm15,  0(%r10)
+	vmovaps 	32(%r10), %ymm12
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm15,  32(%r10)
+	vmovaps 	64(%r10), %ymm12
+	vblendps	$0x03, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm15,  64(%r10)
+	vmovaps 	96(%r10), %ymm12
+	vblendps	$0x07, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm15,  96(%r10)
+	vmovaps 	128(%r10), %ymm12
+	vblendps	$0x0f, %ymm12, %ymm4, %ymm4
+	vmaskmovps	%ymm4, %ymm15,  128(%r10)
+	cmpl		$6, %r12d
+	jl			0f // end
+	vmovaps 	160(%r10), %ymm12
+	vblendps	$0x1f, %ymm12, %ymm5, %ymm5
+	vmaskmovps	%ymm5, %ymm15, 160(%r10)
+	cmpl		$7, %r12d
+	jl			0f // end
+	vmovaps 	192(%r10), %ymm12
+	vblendps	$0x3f, %ymm12, %ymm6, %ymm6
+	vmaskmovps	%ymm6, %ymm15, 192(%r10)
+	je			0f // end
+	vmovaps 	224(%r10), %ymm12
+	vblendps	$0x7f, %ymm12, %ymm7, %ymm7
+	vmaskmovps	%ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_vs_lib8, .-inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x8_gen_lib8, @function
+inner_store_l_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm12, %ymm15
+	vandps		%ymm14, %ymm15, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$8, %eax
+	jle		0f
+	movl	$8, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm15,  0(%r11)
+	vmovaps 	32(%r11), %ymm12
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm15,  32(%r11)
+	vmovaps 	64(%r11), %ymm12
+	vblendps	$0x03, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm15,  64(%r11)
+	vmovaps 	96(%r11), %ymm12
+	vblendps	$0x07, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm15,  96(%r11)
+	vmovaps 	128(%r11), %ymm12
+	vblendps	$0x0f, %ymm12, %ymm4, %ymm4
+	vmaskmovps	%ymm4, %ymm15,  128(%r11)
+	cmpl		$6, %r15d
+	jl			7f // end
+	vmovaps 	160(%r11), %ymm12
+	vblendps	$0x1f, %ymm12, %ymm5, %ymm5
+	vmaskmovps	%ymm5, %ymm15, 160(%r11)
+	cmpl		$7, %r15d
+	jl			7f // end
+	vmovaps 	192(%r11), %ymm12
+	vblendps	$0x3f, %ymm12, %ymm6, %ymm6
+	vmaskmovps	%ymm6, %ymm15, 192(%r11)
+	je			7f // end
+	vmovaps 	224(%r11), %ymm12
+	vblendps	$0x7f, %ymm12, %ymm7, %ymm7
+	vmaskmovps	%ymm7, %ymm15, 224(%r11)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_gen_lib8, .-inner_store_l_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                               rdi    rsi           rdx       rcx       r8           r9        rsp+8
+// void kernel_sgemm_nt_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x8_lib8
+	.type kernel_sgemm_nt_8x8_lib8, @function
+kernel_sgemm_nt_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x8_lib8
+_kernel_sgemm_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x8_lib8
+	.def kernel_sgemm_nt_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x8_lib8, .-kernel_sgemm_nt_8x8_lib8
+#endif
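+
+// usage sketch (illustrative only; the caller and operand names below are
+// assumptions, not part of this file): the kernel computes one 8x8 block
+// D = alpha*A*B^T + beta*C, with all operands stored in 8-row panel-major
+// format (column j of a panel at offset 8*j floats).
+//
+//	float alpha = 1.0f, beta = 1.0f;
+//	kernel_sgemm_nt_8x8_lib8(k, &alpha, Ap, Bp, &beta, Cp, Dp);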
+
+
+
+
+
+//                                  1      2             3         4         5            6         7         8       9
+// void kernel_sgemm_nt_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x8_vs_lib8
+	.type kernel_sgemm_nt_8x8_vs_lib8, @function
+kernel_sgemm_nt_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x8_vs_lib8
+_kernel_sgemm_nt_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x8_vs_lib8
+	.def kernel_sgemm_nt_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x8_vs_lib8, .-kernel_sgemm_nt_8x8_vs_lib8
+#endif
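+
+// note on the _vs ("variable size") variant: the extra arguments km and kn
+// give the number of rows and columns of the 8x8 result that are actually
+// stored, so the same kernel can handle the right/bottom edge of a matrix.
+// Illustrative call for a 5x7 corner block (hypothetical operands, same
+// panel-major assumptions as above):
+//
+//	kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, Ap, Bp, &beta, Cp, Dp, 5, 7);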
+
+
+
+
+
+//                                   rdi    rsi           rdx       rcx       r8           r9           rsp+8     rsp+16   rsp+24       rsp+32    rsp+40   rsp+48  rsp+56  rsp+64  rsp+72
+// void kernel_sgemm_nt_8x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x8_gen_lib8
+	.type kernel_sgemm_nt_8x8_gen_lib8, @function
+kernel_sgemm_nt_8x8_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x8_gen_lib8
+_kernel_sgemm_nt_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x8_gen_lib8
+	.def kernel_sgemm_nt_8x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // offsetC
+	movq	ARG7, %r13 // C
+	movq	ARG8, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG9, %r10 // offsetD
+	movq	ARG10, %r11 // D
+	movq	ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG12, %r13 // m0
+	movq	ARG13, %r14 // m1
+	movq	ARG14, %r15 // n0
+	movq	ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x8_gen_lib8, .-kernel_sgemm_nt_8x8_gen_lib8
+#endif
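+
+// note on the _gen ("generalized") variant: offsetC and offsetD give the
+// row offset of C and D inside their 8-row panels (sdc and sdd are the
+// panel strides), and the window [m0,m1) x [n0,n1) selects which rows and
+// columns of the 8x8 result are written.  Illustrative call writing rows
+// 2..7 and all 8 columns of an aligned block (hypothetical operands):
+//
+//	kernel_sgemm_nt_8x8_gen_lib8(k, &alpha, Ap, Bp, &beta, 0, Cp, sdc, 0, Dp, sdd, 2, 8, 0, 8);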
+
+
+
+
+
+//                               rdi    rsi           rdx        rcx         r8         r9      rsp+8        rsp+16    rsp+24
+// void kernel_sgemm_nn_8x8_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x8_lib8
+	.type kernel_sgemm_nn_8x8_lib8, @function
+kernel_sgemm_nn_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x8_lib8
+_kernel_sgemm_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x8_lib8
+	.def kernel_sgemm_nn_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x8_lib8, .-kernel_sgemm_nn_8x8_lib8
+#endif
+
+
+
+
+
+//                               1      2             3         4            5         6        7            8         9         10      11
+// void kernel_sgemm_nn_8x8_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x8_vs_lib8
+	.type kernel_sgemm_nn_8x8_vs_lib8, @function
+kernel_sgemm_nn_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x8_vs_lib8
+_kernel_sgemm_nn_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x8_vs_lib8
+	.def kernel_sgemm_nn_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // km
+	movq	ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x8_vs_lib8, .-kernel_sgemm_nn_8x8_vs_lib8
+#endif
+
+
+
+
+
+//                                   rdi    rsi           rdx       rcx       r8        r9       rsp+8        rsp+16    rsp+24    rsp+32    rsp+40   rsp+48     rsp+56   rsp+64  rsp+72  rsp+80  rsp+88
+// void kernel_sgemm_nn_8x8_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x8_gen_lib8
+	.type kernel_sgemm_nn_8x8_gen_lib8, @function
+kernel_sgemm_nn_8x8_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x8_gen_lib8
+_kernel_sgemm_nn_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x8_gen_lib8
+	.def kernel_sgemm_nn_8x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // offsetC
+	movq	ARG9, %r13 // C
+	movq	ARG10, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG11, %r10 // offsetD
+	movq	ARG12, %r11 // D
+	movq	ARG13, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG14, %r13 // m0
+	movq	ARG15, %r14 // m1
+	movq	ARG16, %r15 // n0
+	movq	ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x8_gen_lib8, .-kernel_sgemm_nn_8x8_gen_lib8
+#endif
+
+
+
+
+
+//                                 rdi    rsi           rdx       rcx       r8           r9        rsp+8
+// void kernel_ssyrk_nt_l_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_8x8_lib8
+	.type kernel_ssyrk_nt_l_8x8_lib8, @function
+kernel_ssyrk_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_8x8_lib8
+_kernel_ssyrk_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_8x8_lib8
+	.def kernel_ssyrk_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_8x8_lib8, .-kernel_ssyrk_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+//                                    1      2             3         4         5            6         7         8       9
+// void kernel_ssyrk_nt_l_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_8x8_vs_lib8
+	.type kernel_ssyrk_nt_l_8x8_vs_lib8, @function
+kernel_ssyrk_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_8x8_vs_lib8
+_kernel_ssyrk_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_8x8_vs_lib8
+	.def kernel_ssyrk_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_8x8_vs_lib8, .-kernel_ssyrk_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+//                                      edi    rsi       rdx       rcx       r8        r9        rsp+8     
+// void kernel_strsm_nt_rl_inv_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_8x8_lib8
+	.type kernel_strsm_nt_rl_inv_8x8_lib8, @function
+kernel_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_8x8_lib8
+_kernel_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_8x8_lib8
+	.def kernel_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+	movl	$8, %r12d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_8x8_lib8, .-kernel_strsm_nt_rl_inv_8x8_lib8
+#endif
+
+
+
+
+
+//                                         edi    rsi       rdx       rcx       r8        r9        rsp+8               rsp+16  rsp+24  
+// void kernel_strsm_nt_rl_inv_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
+	.type kernel_strsm_nt_rl_inv_8x8_vs_lib8, @function
+kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_8x8_vs_lib8
+_kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
+	.def kernel_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn // TODO scale gen
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG8, %r11 // m1 
+	movq	ARG9, %r12 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_strsm_nt_rl_inv_8x8_vs_lib8
+#endif
+
+
+
+
+
+//                                            1       2          3          4       5          6          7         8         9         10
+// void kernel_sgemm_strsm_nt_rl_inv_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	$8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10   // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+#endif
+
+
+
+
+
+//                                               1       2          3          4       5          6          7         8         9         10                 11      12
+// void kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10  // C 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D 
+	movq	ARG11, %r11 // km 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+#endif
+
+
+
+
+
+//                                  edi    rsi       rdx       rcx       r8        r9
+// void kernel_spotrf_nt_l_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_8x8_lib8
+	.type kernel_spotrf_nt_l_8x8_lib8, @function
+kernel_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_8x8_lib8
+_kernel_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_8x8_lib8
+	.def kernel_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movl	$8, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_8x8_lib8, .-kernel_spotrf_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+//                                     edi    rsi       rdx       rcx       r8        r9                  rsp+8   rsp+16
+// void kernel_spotrf_nt_l_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_8x8_vs_lib8
+	.type kernel_spotrf_nt_l_8x8_vs_lib8, @function
+kernel_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_8x8_vs_lib8
+_kernel_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_8x8_vs_lib8
+	.def kernel_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movq	ARG8, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG7, %r11 // m1 
+	movq	ARG8, %r12 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_8x8_vs_lib8, .-kernel_spotrf_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+//                                        1       2          3          4       5          6          7         8         9
+// void kernel_ssyrk_spotrf_nt_l_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
+	.type kernel_ssyrk_spotrf_nt_l_8x8_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_8x8_lib8
+_kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
+	.def kernel_ssyrk_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$8, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_8x8_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+//                                           1       2          3          4       5          6          7         8         9                  10      11
+// void kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+	.type kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+	.def kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+	movq	ARG10, %r11 // km 
+	movq	ARG11, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_diag_lib8.c b/kernel/avx/kernel_sgemm_diag_lib8.c
new file mode 100644
index 0000000..63183b2
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_diag_lib8.c
@@ -0,0 +1,480 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+
+
+// B is the diagonal of a matrix, beta==0.0 case
+void kernel_sgemm_diag_right_4_a0_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 8;
+
+	int k;
+
+	__m256
+		alpha0,
+		mask_f,
+		sign,
+		a_00,
+		b_00, b_11, b_22, b_33,
+		d_00, d_01, d_02, d_03;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_ss( alpha );
+	
+	b_00 = _mm256_broadcast_ss( &B[0] );
+	b_00 = _mm256_mul_ps( b_00, alpha0 );
+	b_11 = _mm256_broadcast_ss( &B[1] );
+	b_11 = _mm256_mul_ps( b_11, alpha0 );
+	b_22 = _mm256_broadcast_ss( &B[2] );
+	b_22 = _mm256_mul_ps( b_22, alpha0 );
+	b_33 = _mm256_broadcast_ss( &B[3] );
+	b_33 = _mm256_mul_ps( b_33, alpha0 );
+	
+	for(k=0; k<kmax-7; k+=8)
+		{
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+		a_00 = _mm256_load_ps( &A[8] );
+		d_01 = _mm256_mul_ps( a_00, b_11 );
+		a_00 = _mm256_load_ps( &A[16] );
+		d_02 = _mm256_mul_ps( a_00, b_22 );
+		a_00 = _mm256_load_ps( &A[24] );
+		d_03 = _mm256_mul_ps( a_00, b_33 );
+
+		_mm256_store_ps( &D[0], d_00 );
+		_mm256_store_ps( &D[8], d_01 );
+		_mm256_store_ps( &D[16], d_02 );
+		_mm256_store_ps( &D[24], d_03 );
+
+		A += 8*sda;
+		D += 8*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+		float m_f = kmax-k;
+
+		mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+		a_00 = _mm256_load_ps( &A[8] );
+		d_01 = _mm256_mul_ps( a_00, b_11 );
+		a_00 = _mm256_load_ps( &A[16] );
+		d_02 = _mm256_mul_ps( a_00, b_22 );
+		a_00 = _mm256_load_ps( &A[24] );
+		d_03 = _mm256_mul_ps( a_00, b_33 );
+
+		_mm256_maskstore_ps( &D[0], mask_i, d_00 );
+		_mm256_maskstore_ps( &D[8], mask_i, d_01 );
+		_mm256_maskstore_ps( &D[16], mask_i, d_02 );
+		_mm256_maskstore_ps( &D[24], mask_i, d_03 );
+
+		}
+	
+	}
+
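+// Note: the kernels in this file all follow the same pattern: operands are
+// stored in 8-row panel-major format (element (i,j) of a matrix with panel
+// stride sd lives at offset (i/8)*8*sd + 8*j + i%8), full 8-row blocks are
+// processed with 256-bit loads/stores, and the partial last block is
+// handled with _mm256_maskstore_ps, using the sign of {0.5,...,7.5} - m as
+// the per-lane mask.  Scalar reference of the kernel above (an illustrative
+// sketch only, not part of the library API):
+//
+//	void ref_sgemm_diag_right_4_a0(int kmax, float alpha, float *A, int sda, float *B, float *D, int sdd)
+//		{
+//		int k, j;
+//		for(k=0; k<kmax; k++)
+//			for(j=0; j<4; j++)
+//				D[(k/8)*8*sdd + 8*j + k%8] = alpha * A[(k/8)*8*sda + 8*j + k%8] * B[j];
+//		}
+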
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 8;
+
+	int k;
+
+	__m256
+		alpha0, beta0,
+		mask_f,
+		sign,
+		a_00,
+		b_00, b_11, b_22, b_33,
+		c_00,
+		d_00, d_01, d_02, d_03;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_ss( alpha );
+	beta0  = _mm256_broadcast_ss( beta );
+	
+	b_00 = _mm256_broadcast_ss( &B[0] );
+	b_00 = _mm256_mul_ps( b_00, alpha0 );
+	b_11 = _mm256_broadcast_ss( &B[1] );
+	b_11 = _mm256_mul_ps( b_11, alpha0 );
+	b_22 = _mm256_broadcast_ss( &B[2] );
+	b_22 = _mm256_mul_ps( b_22, alpha0 );
+	b_33 = _mm256_broadcast_ss( &B[3] );
+	b_33 = _mm256_mul_ps( b_33, alpha0 );
+	
+	for(k=0; k<kmax-7; k+=8)
+		{
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+		a_00 = _mm256_load_ps( &A[8] );
+		d_01 = _mm256_mul_ps( a_00, b_11 );
+		a_00 = _mm256_load_ps( &A[16] );
+		d_02 = _mm256_mul_ps( a_00, b_22 );
+		a_00 = _mm256_load_ps( &A[24] );
+		d_03 = _mm256_mul_ps( a_00, b_33 );
+
+		c_00 = _mm256_load_ps( &C[0] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_00 = _mm256_add_ps( c_00, d_00 );
+		c_00 = _mm256_load_ps( &C[8] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_01 = _mm256_add_ps( c_00, d_01 );
+		c_00 = _mm256_load_ps( &C[16] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_02 = _mm256_add_ps( c_00, d_02 );
+		c_00 = _mm256_load_ps( &C[24] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_03 = _mm256_add_ps( c_00, d_03 );
+
+		_mm256_store_ps( &D[0], d_00 );
+		_mm256_store_ps( &D[8], d_01 );
+		_mm256_store_ps( &D[16], d_02 );
+		_mm256_store_ps( &D[24], d_03 );
+
+		A += 8*sda;
+		C += 8*sdc;
+		D += 8*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+		float m_f = kmax-k;
+
+		mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+		a_00 = _mm256_load_ps( &A[8] );
+		d_01 = _mm256_mul_ps( a_00, b_11 );
+		a_00 = _mm256_load_ps( &A[16] );
+		d_02 = _mm256_mul_ps( a_00, b_22 );
+		a_00 = _mm256_load_ps( &A[24] );
+		d_03 = _mm256_mul_ps( a_00, b_33 );
+
+		c_00 = _mm256_load_ps( &C[0] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_00 = _mm256_add_ps( c_00, d_00 );
+		c_00 = _mm256_load_ps( &C[8] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_01 = _mm256_add_ps( c_00, d_01 );
+		c_00 = _mm256_load_ps( &C[16] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_02 = _mm256_add_ps( c_00, d_02 );
+		c_00 = _mm256_load_ps( &C[24] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_03 = _mm256_add_ps( c_00, d_03 );
+
+		_mm256_maskstore_ps( &D[0], mask_i, d_00 );
+		_mm256_maskstore_ps( &D[8], mask_i, d_01 );
+		_mm256_maskstore_ps( &D[16], mask_i, d_02 );
+		_mm256_maskstore_ps( &D[24], mask_i, d_03 );
+
+		}
+	
+	}
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_3_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 8;
+
+	int k;
+
+	__m256
+		alpha0, beta0,
+		mask_f,
+		sign,
+		a_00,
+		b_00, b_11, b_22,
+		c_00,
+		d_00, d_01, d_02;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_ss( alpha );
+	beta0  = _mm256_broadcast_ss( beta );
+	
+	b_00 = _mm256_broadcast_ss( &B[0] );
+	b_00 = _mm256_mul_ps( b_00, alpha0 );
+	b_11 = _mm256_broadcast_ss( &B[1] );
+	b_11 = _mm256_mul_ps( b_11, alpha0 );
+	b_22 = _mm256_broadcast_ss( &B[2] );
+	b_22 = _mm256_mul_ps( b_22, alpha0 );
+	
+	for(k=0; k<kmax-7; k+=8)
+		{
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+		a_00 = _mm256_load_ps( &A[8] );
+		d_01 = _mm256_mul_ps( a_00, b_11 );
+		a_00 = _mm256_load_ps( &A[16] );
+		d_02 = _mm256_mul_ps( a_00, b_22 );
+
+		c_00 = _mm256_load_ps( &C[0] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_00 = _mm256_add_ps( c_00, d_00 );
+		c_00 = _mm256_load_ps( &C[8] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_01 = _mm256_add_ps( c_00, d_01 );
+		c_00 = _mm256_load_ps( &C[16] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_02 = _mm256_add_ps( c_00, d_02 );
+
+		_mm256_store_ps( &D[0], d_00 );
+		_mm256_store_ps( &D[8], d_01 );
+		_mm256_store_ps( &D[16], d_02 );
+
+		A += 8*sda;
+		C += 8*sdc;
+		D += 8*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+		float m_f = kmax-k;
+
+		mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+		a_00 = _mm256_load_ps( &A[8] );
+		d_01 = _mm256_mul_ps( a_00, b_11 );
+		a_00 = _mm256_load_ps( &A[16] );
+		d_02 = _mm256_mul_ps( a_00, b_22 );
+
+		c_00 = _mm256_load_ps( &C[0] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_00 = _mm256_add_ps( c_00, d_00 );
+		c_00 = _mm256_load_ps( &C[8] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_01 = _mm256_add_ps( c_00, d_01 );
+		c_00 = _mm256_load_ps( &C[16] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_02 = _mm256_add_ps( c_00, d_02 );
+
+		_mm256_maskstore_ps( &D[0], mask_i, d_00 );
+		_mm256_maskstore_ps( &D[8], mask_i, d_01 );
+		_mm256_maskstore_ps( &D[16], mask_i, d_02 );
+
+		}
+	
+	}
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_2_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 8;
+
+	int k;
+
+	__m256
+		alpha0, beta0,
+		mask_f,
+		sign,
+		a_00,
+		b_00, b_11,
+		c_00,
+		d_00, d_01;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_ss( alpha );
+	beta0  = _mm256_broadcast_ss( beta );
+	
+	b_00 = _mm256_broadcast_ss( &B[0] );
+	b_00 = _mm256_mul_ps( b_00, alpha0 );
+	b_11 = _mm256_broadcast_ss( &B[1] );
+	b_11 = _mm256_mul_ps( b_11, alpha0 );
+	
+	for(k=0; k<kmax-7; k+=8)
+		{
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+		a_00 = _mm256_load_ps( &A[8] );
+		d_01 = _mm256_mul_ps( a_00, b_11 );
+
+		c_00 = _mm256_load_ps( &C[0] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_00 = _mm256_add_ps( c_00, d_00 );
+		c_00 = _mm256_load_ps( &C[8] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_01 = _mm256_add_ps( c_00, d_01 );
+
+		_mm256_store_ps( &D[0], d_00 );
+		_mm256_store_ps( &D[8], d_01 );
+
+		A += 8*sda;
+		C += 8*sdc;
+		D += 8*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+		float m_f = kmax-k;
+
+		mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+		a_00 = _mm256_load_ps( &A[8] );
+		d_01 = _mm256_mul_ps( a_00, b_11 );
+
+		c_00 = _mm256_load_ps( &C[0] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_00 = _mm256_add_ps( c_00, d_00 );
+		c_00 = _mm256_load_ps( &C[8] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_01 = _mm256_add_ps( c_00, d_01 );
+
+		_mm256_maskstore_ps( &D[0], mask_i, d_00 );
+		_mm256_maskstore_ps( &D[8], mask_i, d_01 );
+
+		}
+	
+	}
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_1_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 8;
+
+	int k;
+
+	__m256
+		alpha0, beta0,
+		mask_f,
+		sign,
+		a_00,
+		b_00,
+		c_00,
+		d_00;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_ss( alpha );
+	beta0  = _mm256_broadcast_ss( beta );
+	
+	b_00 = _mm256_broadcast_ss( &B[0] );
+	b_00 = _mm256_mul_ps( b_00, alpha0 );
+	
+	for(k=0; k<kmax-7; k+=8)
+		{
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+
+		c_00 = _mm256_load_ps( &C[0] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_00 = _mm256_add_ps( c_00, d_00 );
+
+		_mm256_store_ps( &D[0], d_00 );
+
+		A += 8*sda;
+		C += 8*sdc;
+		D += 8*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+		float m_f = kmax-k;
+
+		mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+
+		c_00 = _mm256_load_ps( &C[0] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_00 = _mm256_add_ps( c_00, d_00 );
+
+		_mm256_maskstore_ps( &D[0], mask_i, d_00 );
+
+		}
+	
+	}
+
+
+
+
diff --git a/kernel/avx/kernel_sgemv_4_lib8.S b/kernel/avx/kernel_sgemv_4_lib8.S
new file mode 100644
index 0000000..1508ebe
--- /dev/null
+++ b/kernel/avx/kernel_sgemv_4_lib8.S
@@ -0,0 +1,2935 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(float)
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x+k*sizeof(float)
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
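+// In scalar terms, the routine below accumulates (before the final blend /
+// reduction) the four dot products of a 4-column strip of the panel-major A
+// (panel height 8, panel stride sda, float data) with x, i.e. z += A^T * x.
+// Editorial reference sketch in C, not code from the library:
+//
+//	void ref_gemv_add_t_4_lib8(int k, float *A, int sda, float *x, float *z)
+//		{
+//		int i, j;
+//		for(j=0; j<4; j++)
+//			for(i=0; i<k; i++)
+//				z[j] += A[(i/8)*8*sda + i%8 + 8*j] * x[i];
+//		}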
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemv_add_t_4_lib8, @function
+inner_kernel_gemv_add_t_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_t_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemv_add_t_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_t_4_lib8:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$8, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovups		0(%r13), %ymm12
+
+	vmovaps		0(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	
+	subl	$8, %r10d
+
+	vmovaps		32(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	
+	vmovaps		64(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+
+	vmovaps		96(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+	
+	addq	%r12, %r11
+	addq	$32, %r13
+	
+	cmpl	$7, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vcvtsi2ss	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%ymm14, %ymm13, %ymm14
+
+	vmaskmovps	0(%r13), %ymm14, %ymm12
+
+	vmaskmovps	0(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	
+	vmaskmovps	32(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	
+	vmaskmovps	64(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+
+	vmaskmovps	96(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+		
+	sall	$2, %r10d // *sizeof(float)
+	addq	%r10, %r11
+	addq	%r10, %r13
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemv_add_t_4_lib8, .-inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x_t
+// r14   <- z_n
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(float)
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x_t+k*sizeof(float)
+// r14   <- z_n+k*sizeof(float)
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
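+// The nt routine below fuses the two products that share the same 4-column
+// strip of A: the 't' part accumulates z_t[j] += A(:,j)^T * x_t into
+// ymm0..ymm3, while the 'n' part updates z_n += A(:,j) * x_n[j] in memory
+// as it goes (x_n_0..x_n_3 arrive pre-scaled by alpha_n in ymm6..ymm9).
+// Editorial reference sketch in C, not code from the library:
+//
+//	void ref_gemv_add_nt_4_lib8(int k, float *A, int sda, float *x_t,
+//		float *x_n, float *z_t, float *z_n)
+//		{
+//		int i, j;
+//		for(i=0; i<k; i++)
+//			for(j=0; j<4; j++)
+//				{
+//				float a = A[(i/8)*8*sda + i%8 + 8*j];
+//				z_t[j] += a * x_t[i];
+//				z_n[i] += a * x_n[j];
+//				}
+//		}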
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemv_add_nt_4_lib8, @function
+inner_kernel_gemv_add_nt_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_nt_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemv_add_nt_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_nt_4_lib8:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$8, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovups	0(%r13), %ymm12
+	vmovups	0(%r14), %ymm13
+
+	vmovaps	0(%r11), %ymm14
+	vmulps	%ymm14, %ymm12, %ymm15
+	vaddps	%ymm0, %ymm15, %ymm0
+	vmulps	%ymm14, %ymm6, %ymm15
+	vaddps	%ymm13, %ymm15, %ymm13
+	
+	subl	$8, %r10d
+
+	vmovaps	32(%r11), %ymm14
+	vmulps	%ymm14, %ymm12, %ymm15
+	vaddps	%ymm1, %ymm15, %ymm1
+	vmulps	%ymm14, %ymm7, %ymm15
+	vaddps	%ymm13, %ymm15, %ymm13
+	
+	vmovaps	64(%r11), %ymm14
+	vmulps	%ymm14, %ymm12, %ymm15
+	vaddps	%ymm2, %ymm15, %ymm2
+	vmulps	%ymm14, %ymm8, %ymm15
+	vaddps	%ymm13, %ymm15, %ymm13
+
+	vmovaps	96(%r11), %ymm14
+	vmulps	%ymm14, %ymm12, %ymm15
+	vaddps	%ymm3, %ymm15, %ymm3
+	vmulps	%ymm14, %ymm9, %ymm15
+	vaddps	%ymm13, %ymm15, %ymm13
+	
+	vmovups	%ymm13, 0(%r14) 
+
+	addq	%r12, %r11
+	addq	$32, %r13
+	addq	$32, %r14
+	
+	cmpl	$7, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vcvtsi2ss	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm13
+#endif
+	vshufps		$0x0, %xmm14, %xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%ymm14, %ymm13, %ymm11
+
+	vmaskmovps	0(%r13), %ymm11, %ymm12
+	vmaskmovps	0(%r14), %ymm11, %ymm13
+
+//	vmovups	%ymm14, -32(%rsp) // spill mask to stack
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovps	0(%r11), %ymm11, %ymm14
+	vmulps	%ymm14, %ymm12, %ymm15
+	vaddps	%ymm0, %ymm15, %ymm0
+	vmulps	%ymm14, %ymm6, %ymm15
+	vaddps	%ymm13, %ymm15, %ymm13
+	
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovps	32(%r11), %ymm11, %ymm14
+	vmulps	%ymm14, %ymm12, %ymm15
+	vaddps	%ymm1, %ymm15, %ymm1
+	vmulps	%ymm14, %ymm7, %ymm15
+	vaddps	%ymm13, %ymm15, %ymm13
+	
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovps	64(%r11), %ymm11, %ymm14
+	vmulps	%ymm14, %ymm12, %ymm15
+	vaddps	%ymm2, %ymm15, %ymm2
+	vmulps	%ymm14, %ymm8, %ymm15
+	vaddps	%ymm13, %ymm15, %ymm13
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovps	96(%r11), %ymm11, %ymm14
+	vmulps	%ymm14, %ymm12, %ymm15
+	vaddps	%ymm3, %ymm15, %ymm3
+	vmulps	%ymm14, %ymm9, %ymm15
+	vaddps	%ymm13, %ymm15, %ymm13
+		
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovps	%ymm13, %ymm11, 0(%r14)
+
+	sall	$2, %r10d // *sizeof(float)
+	addq	%r10, %r11
+	addq	%r10, %r13
+	addq	%r10, %r14
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemv_add_nt_4_lib8, .-inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x
+// r14d  <- offA
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 
+// r11   <- 
+// r12   <- 
+// r13   <- 
+// r14d  <- offA
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
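+// The edge mask built below selects the lanes offA <= i < offA+kmax of the
+// first 8-row panel: sign(offA - (i+0.5)) is set for i >= offA,
+// sign((i+0.5) - (offA+kmax)) is set for i < offA+kmax, and the vandps of
+// the two keeps their intersection.  Scalar form of the predicate
+// (editorial note):
+//
+//	// lane i is loaded iff
+//	int active = (i >= offA) && (i < offA + kmax);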
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_GEMV_ADD_T_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_gemv_add_t_4_lib8, @function
+inner_edge_gemv_add_t_4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemv_add_t_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_gemv_add_t_4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemv_add_t_4_lib8:
+#endif
+#endif
+
+	cmpl	$0, %r14d
+	jle		0f // return
+
+	movl	%r14d, %r15d
+	sall	$2, %r15d // offA*sizeof(float)
+
+	subq	%r15, %r11 // A - offA
+	subq	%r15, %r13 // x - offA
+
+	movl	%r10d, %r15d // kmax
+	addl	%r14d, %r15d // kmax + offA
+
+	vcvtsi2ss	%r14d, %xmm14, %xmm14 // offA
+	vcvtsi2ss	%r15d, %xmm15, %xmm15 // offA + kmax
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm13, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm13, %ymm15
+	vandps		%ymm15, %ymm14, %ymm14
+
+	vmaskmovps	0(%r13), %ymm14, %ymm12
+
+	vmovaps		0(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	
+	vmovaps		32(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	
+	vmovaps		64(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+
+	vmovaps		96(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+	addq	$32, %r13 // x + 4
+	addq	%r12, %r11 // A + bs*sda
+		
+	addl	%r14d, %r10d
+	subl	$8, %r10d // kmax - (8-offA)
+	
+0: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_gemv_add_t_4_lib8, .-inner_edge_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+
+
+
+#if 0
+// TODO
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsv_lt_inv_8_lib8, @function
+inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_lib8:
+#endif
+#endif
+	
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vmovaps			0(%r10), %ymm12
+	vblendps		$0x01, %ymm14, %ymm12, %ymm12
+	vmovaps			32(%r10), %ymm13
+	vblendps		$0x03, %ymm14, %ymm13, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm8
+	vunpckhps		%ymm13, %ymm12, %ymm9
+
+	vmovaps			64(%r10), %ymm12
+	vblendps		$0x07, %ymm14, %ymm12, %ymm12
+	vmovaps			96(%r10), %ymm13
+	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm10
+	vunpckhps		%ymm13, %ymm12, %ymm11
+
+	vshufps			$0x44, %ymm10, %ymm8, %ymm7
+	vshufps			$0xee, %ymm10, %ymm8, %ymm4
+	vshufps			$0x44, %ymm11, %ymm9, %ymm5
+	vshufps			$0xee, %ymm11, %ymm9, %ymm6
+	vextractf128	$0x1, %ymm7, %xmm7
+	vextractf128	$0x1, %ymm4, %xmm8
+	vextractf128	$0x1, %ymm5, %xmm9
+	vextractf128	$0x1, %ymm6, %xmm10
+
+	vmovaps			144(%r10), %xmm12
+	vblendps		$0x01, %xmm14, %xmm12, %xmm12
+	vmovaps			176(%r10), %xmm13
+	vblendps		$0x03, %xmm14, %xmm13, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm1
+	vunpckhps		%xmm13, %xmm12, %xmm2
+
+	vmovaps			208(%r10), %xmm12
+	vblendps		$0x07, %xmm14, %xmm12, %xmm12
+	vmovaps			240(%r10), %xmm13
+	vblendps		$0x0f, %xmm14, %xmm13, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm3
+	vunpckhps		%xmm13, %xmm12, %xmm15
+
+	vshufps			$0xee, %xmm3, %xmm1, %xmm11
+	vshufps			$0x44, %xmm15, %xmm2, %xmm12
+	vshufps			$0xee, %xmm15, %xmm2, %xmm13
+
+
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vextractf128	$0x1, %ymm0, %xmm1
+
+	vshufps			$0xff, %xmm1, %xmm1, %xmm2
+	vbroadcastss	28(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x08, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm10, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm13, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+	vshufps			$0xaa, %xmm1, %xmm1, %xmm2
+	vbroadcastss	24(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x04, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm9, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm12, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+	vshufps			$0x55, %xmm1, %xmm1, %xmm2
+	vbroadcastss	20(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x02, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm8, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm11, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+	vshufps			$0x00, %xmm1, %xmm1, %xmm2
+	vbroadcastss	16(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x01, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm7, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+	vshufps			$0xff, %xmm0, %xmm0, %xmm2
+	vbroadcastss	12(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x08, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm6, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+	vshufps			$0xaa, %xmm0, %xmm0, %xmm2
+	vbroadcastss	8(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x04, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm5, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+	vshufps			$0x55, %xmm0, %xmm0, %xmm2
+	vbroadcastss	4(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x02, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm4, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+	vshufps			$0x00, %xmm0, %xmm0, %xmm2
+	vbroadcastss	0(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x01, %xmm2, %xmm0, %xmm0
+
+	vinsertf128		$0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsv_lt_inv_8_lib8, .-inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- km
+// r13  <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- km
+// r13  <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsv_lt_inv_8_vs_lib8, @function
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#endif
+#endif
+	
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vmovaps			0(%r10), %ymm12
+	vblendps		$0x01, %ymm14, %ymm12, %ymm12
+	cmpl	$2, %r13d
+	jl		1f
+	vmovaps			32(%r10), %ymm13
+	vblendps		$0x03, %ymm14, %ymm13, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm8
+	vunpckhps		%ymm13, %ymm12, %ymm9
+
+	cmpl	$3, %r13d
+	jl		2f
+	vmovaps			64(%r10), %ymm12
+	vblendps		$0x07, %ymm14, %ymm12, %ymm12
+	cmpl	$4, %r13d
+	jl		3f
+	vmovaps			96(%r10), %ymm13
+	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm10
+	vunpckhps		%ymm13, %ymm12, %ymm11
+
+	vshufps			$0x44, %ymm10, %ymm8, %ymm7
+	vshufps			$0xee, %ymm10, %ymm8, %ymm4
+	vshufps			$0x44, %ymm11, %ymm9, %ymm5
+	vshufps			$0xee, %ymm11, %ymm9, %ymm6
+	vextractf128	$0x1, %ymm7, %xmm7
+	vextractf128	$0x1, %ymm4, %xmm8
+	vextractf128	$0x1, %ymm5, %xmm9
+	vextractf128	$0x1, %ymm6, %xmm10
+
+	cmpl	$5, %r13d
+	jl		4f
+	vmovaps			144(%r10), %xmm12
+	vblendps		$0x01, %xmm14, %xmm12, %xmm12
+	cmpl	$6, %r13d
+	jl		5f
+	vmovaps			176(%r10), %xmm13
+	vblendps		$0x03, %xmm14, %xmm13, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm1
+	vunpckhps		%xmm13, %xmm12, %xmm2
+
+	cmpl	$7, %r13d
+	jl		6f
+	vmovaps			208(%r10), %xmm12
+	vblendps		$0x07, %xmm14, %xmm12, %xmm12
+	cmpl	$8, %r13d
+	jl		7f
+	vmovaps			240(%r10), %xmm13
+	vblendps		$0x0f, %xmm14, %xmm13, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm3
+	vunpckhps		%xmm13, %xmm12, %xmm15
+
+	vshufps			$0xee, %xmm3, %xmm1, %xmm11
+	vshufps			$0x44, %xmm15, %xmm2, %xmm12
+	vshufps			$0xee, %xmm15, %xmm2, %xmm13
+
+	jmp		0f
+
+
+
+	vmovaps			%ymm14, %ymm12
+1:
+	vmovaps			%ymm14, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm8
+	vunpckhps		%ymm13, %ymm12, %ymm9
+
+2:
+	vmovaps			%ymm14, %ymm12
+3:
+	vmovaps			%ymm14, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm10
+	vunpckhps		%ymm13, %ymm12, %ymm11
+
+	vshufps			$0x44, %ymm10, %ymm8, %ymm7
+	vshufps			$0xee, %ymm10, %ymm8, %ymm4
+	vshufps			$0x44, %ymm11, %ymm9, %ymm5
+	vshufps			$0xee, %ymm11, %ymm9, %ymm6
+	vextractf128	$0x1, %ymm7, %xmm7
+	vextractf128	$0x1, %ymm4, %xmm8
+	vextractf128	$0x1, %ymm5, %xmm9
+	vextractf128	$0x1, %ymm6, %xmm10
+
+	jmp		8f
+
+4:
+	vmovaps			%xmm14, %xmm12
+5:
+	vmovaps			%xmm14, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm1
+	vunpckhps		%xmm13, %xmm12, %xmm2
+
+6:
+	vmovaps			%xmm14, %xmm12
+7:
+	vmovaps			%xmm14, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm3
+	vunpckhps		%xmm13, %xmm12, %xmm15
+
+	vshufps			$0xee, %xmm3, %xmm1, %xmm11
+	vshufps			$0x44, %xmm15, %xmm2, %xmm12
+	vshufps			$0xee, %xmm15, %xmm2, %xmm13
+
+8:
+	
+	vmovaps			%xmm14, %xmm11
+	vmovaps			%xmm14, %xmm12
+	vmovaps			%xmm14, %xmm13
+
+0:
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vextractf128	$0x1, %ymm0, %xmm1
+
+	cmpl	$8, %r12d
+	jl		0f
+
+	vshufps			$0xff, %xmm1, %xmm1, %xmm2
+	vbroadcastss	28(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x08, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm10, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm13, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+0:
+	cmpl	$7, %r12d
+	jl		0f
+
+	vshufps			$0xaa, %xmm1, %xmm1, %xmm2
+	vbroadcastss	24(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x04, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm9, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm12, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+0:
+	cmpl	$6, %r12d
+	jl		0f
+
+	vshufps			$0x55, %xmm1, %xmm1, %xmm2
+	vbroadcastss	20(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x02, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm8, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm11, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+0:
+	cmpl	$5, %r12d
+	jl		0f
+
+	vshufps			$0x00, %xmm1, %xmm1, %xmm2
+	vbroadcastss	16(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x01, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm7, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+0:
+	cmpl	$4, %r12d
+	jl		0f
+
+	vshufps			$0xff, %xmm0, %xmm0, %xmm2
+	vbroadcastss	12(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x08, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm6, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+0:
+	cmpl	$3, %r12d
+	jl		0f
+
+	vshufps			$0xaa, %xmm0, %xmm0, %xmm2
+	vbroadcastss	8(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x04, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm5, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+0:
+	cmpl	$2, %r12d
+	jl		0f
+
+	vshufps			$0x55, %xmm0, %xmm0, %xmm2
+	vbroadcastss	4(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x02, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm4, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+0:
+	cmpl	$1, %r12d
+	jl		0f
+
+	vshufps			$0x00, %xmm0, %xmm0, %xmm2
+	vbroadcastss	0(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x01, %xmm2, %xmm0, %xmm0
+
+0:
+
+	vinsertf128		$0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsv_lt_inv_8_vs_lib8, .-inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10   <- kmax
+// r11   <- A
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x_t
+// r14   <- z_n
+// r15   <- offA
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- kmax-(8-offA)
+// r11   <- A+4*k*sizeof(float)
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x_t+k*sizeof(float)
+// r14   <- z_n+k*sizeof(float)
+// r15   <- offA
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
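+// The 4l edge routine below handles the diagonal block of a symmetric matrix
+// stored as its lower triangle: for column j it adds rows i >= j to the 't'
+// accumulators and rows i > j to z_n, so each strictly-lower entry A(i,j)
+// contributes twice (as A(i,j) and as A(j,i)) while the diagonal contributes
+// once.  Editorial scalar sketch of the diagonal-block update, assuming
+// offA==0 for clarity, with m (m <= 8) the rows handled here and x_n
+// pre-scaled by alpha:
+//
+//	for(j=0; j<4; j++)
+//		{
+//		z_t[j] += A[8*j+j] * x_t[j];     // diagonal entry, used once
+//		for(i=j+1; i<m; i++)
+//			{
+//			z_t[j] += A[8*j+i] * x_t[i]; // lower triangle, 't' side
+//			z_n[i] += A[8*j+i] * x_n[j]; // mirrored 'n' side
+//			}
+//		}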
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_SYMV_ADD_NT_4L_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_symv_add_nt_4l_lib8, @function
+inner_edge_symv_add_nt_4l_lib8:
+#elif defined(OS_MAC)
+_inner_edge_symv_add_nt_4l_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_symv_add_nt_4l_lib8; .scl 2; .type 32; .endef
+inner_edge_symv_add_nt_4l_lib8:
+#endif
+#endif
+
+	movl	$8, %eax
+	cmpl	%eax, %r10d
+	jge		0f
+	movl	%r10d, %eax
+0:
+	subl	%r15d, %eax
+
+	vcvtsi2ss	%eax, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm13
+#endif
+	vshufps		$0x0, %xmm14, %xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%ymm14, %ymm13, %ymm11
+
+	vmaskmovps	0(%r13), %ymm11, %ymm12
+	vmaskmovps	0(%r14), %ymm11, %ymm13
+
+	vmaskmovps	0(%r11), %ymm11, %ymm14
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vxorps		%ymm15, %ymm15, %ymm15
+	vblendps	$0x01, %ymm15, %ymm14, %ymm14
+	vmulps		%ymm14, %ymm6, %ymm15
+	vaddps		%ymm13, %ymm15, %ymm13
+	
+	vmaskmovps	32(%r11), %ymm11, %ymm14
+	vxorps		%ymm15, %ymm15, %ymm15
+	vblendps	$0x01, %ymm15, %ymm14, %ymm14
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vxorps		%ymm15, %ymm15, %ymm15
+	vblendps	$0x03, %ymm15, %ymm14, %ymm14
+	vmulps		%ymm14, %ymm7, %ymm15
+	vaddps		%ymm13, %ymm15, %ymm13
+	
+	vmaskmovps	64(%r11), %ymm11, %ymm14
+	vxorps		%ymm15, %ymm15, %ymm15
+	vblendps	$0x03, %ymm15, %ymm14, %ymm14
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vxorps		%ymm15, %ymm15, %ymm15
+	vblendps	$0x07, %ymm15, %ymm14, %ymm14
+	vmulps		%ymm14, %ymm8, %ymm15
+	vaddps		%ymm13, %ymm15, %ymm13
+
+	vmaskmovps	96(%r11), %ymm11, %ymm14
+	vxorps		%ymm15, %ymm15, %ymm15
+	vblendps	$0x07, %ymm15, %ymm14, %ymm14
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+	vxorps		%ymm15, %ymm15, %ymm15
+	vblendps	$0x0f, %ymm15, %ymm14, %ymm14
+	vmulps		%ymm14, %ymm9, %ymm15
+	vaddps		%ymm13, %ymm15, %ymm13
+	
+	vmaskmovps	%ymm13, %ymm11, 0(%r14)
+
+	subl	%eax, %r10d
+
+	salq	$2, %rax // *sizeof(float)
+	addq	%rax, %r11
+	subq	$32, %r11
+	addq	%r12, %r11
+	addq	%rax, %r13
+	addq	%rax, %r14
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_symv_add_nt_4l_lib8, .-inner_edge_symv_add_nt_4l_lib8
+#endif
+#endif
+
+
+
+
+
+
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_SYMV_ADD_NT_4R_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_symv_add_nt_4r_lib8, @function
+inner_edge_symv_add_nt_4r_lib8:
+#elif defined(OS_MAC)
+_inner_edge_symv_add_nt_4r_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_symv_add_nt_4r_lib8; .scl 2; .type 32; .endef
+inner_edge_symv_add_nt_4r_lib8:
+#endif
+#endif
+
+	movl	$4, %eax
+	cmpl	%eax, %r10d
+	jge		0f
+	movl	%r10d, %eax
+0:
+	subl	%r15d, %eax
+
+	vcvtsi2ss	%eax, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %xmm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %xmm13
+#endif
+	vshufps		$0x0, %xmm14, %xmm14, %xmm14
+//	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%xmm14, %xmm13, %xmm11
+
+	vmaskmovps	0(%r13), %xmm11, %xmm12
+	vmaskmovps	0(%r14), %xmm11, %xmm13
+
+	vmaskmovps	0(%r11), %xmm11, %xmm14
+	vmulps		%xmm14, %xmm12, %xmm15
+	vaddps		%xmm0, %xmm15, %xmm0
+	vxorps		%xmm15, %xmm15, %xmm15
+	vblendps	$0x01, %xmm15, %xmm14, %xmm14
+	vmulps		%xmm14, %xmm6, %xmm15
+	vaddps		%xmm13, %xmm15, %xmm13
+	
+	vmaskmovps	32(%r11), %xmm11, %xmm14
+	vxorps		%xmm15, %xmm15, %xmm15
+	vblendps	$0x01, %xmm15, %xmm14, %xmm14
+	vmulps		%xmm14, %xmm12, %xmm15
+	vaddps		%xmm1, %xmm15, %xmm1
+	vxorps		%xmm15, %xmm15, %xmm15
+	vblendps	$0x03, %xmm15, %xmm14, %xmm14
+	vmulps		%xmm14, %xmm7, %xmm15
+	vaddps		%xmm13, %xmm15, %xmm13
+	
+	vmaskmovps	64(%r11), %xmm11, %xmm14
+	vxorps		%xmm15, %xmm15, %xmm15
+	vblendps	$0x03, %xmm15, %xmm14, %xmm14
+	vmulps		%xmm14, %xmm12, %xmm15
+	vaddps		%xmm2, %xmm15, %xmm2
+	vxorps		%xmm15, %xmm15, %xmm15
+	vblendps	$0x07, %xmm15, %xmm14, %xmm14
+	vmulps		%xmm14, %xmm8, %xmm15
+	vaddps		%xmm13, %xmm15, %xmm13
+
+	vmaskmovps	96(%r11), %xmm11, %xmm14
+	vxorps		%xmm15, %xmm15, %xmm15
+	vblendps	$0x07, %xmm15, %xmm14, %xmm14
+	vmulps		%xmm14, %xmm12, %xmm15
+	vaddps		%xmm3, %xmm15, %xmm3
+//	vxorps		%xmm15, %xmm15, %xmm15
+//	vblendps	$0x0f, %xmm15, %xmm14, %xmm14
+//	vmulps		%xmm14, %xmm9, %xmm15
+//	vaddps		%xmm13, %xmm15, %xmm13
+	
+	vmaskmovps	%xmm13, %xmm11, 0(%r14)
+
+	subl	%eax, %r10d
+
+	salq	$2, %rax // *sizeof(float)
+	addq	%rax, %r11
+	subq	$32, %r11
+	addq	%r12, %r11
+	addq	%rax, %r13
+	addq	%rax, %r14
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_symv_add_nt_4r_lib8, .-inner_edge_symv_add_nt_4r_lib8
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
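+// The blend below reduces each 8-lane accumulator to a scalar (vhaddps tree,
+// extract the high 128-bit lane, add) and then applies alpha and beta, so
+// xmm0 leaves the routine holding, for j = 0..3 (editorial note):
+//
+//	z[j] = alpha[0] * (sum of the 8 lanes of acc_j) + beta[0] * y[j];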
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_ab_4_lib8, @function
+inner_blend_t_scale_ab_4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_ab_4_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_4_lib8:
+#endif
+#endif
+
+	// reduction
+	vhaddps			%ymm1, %ymm0, %ymm0
+	vhaddps			%ymm3, %ymm2, %ymm2
+
+	vhaddps			%ymm2, %ymm0, %ymm0
+
+	vextractf128	$0x1, %ymm0, %xmm1
+
+	vaddps			%xmm0, %xmm1, %xmm0
+
+	// alpha
+	vbroadcastss	0(%r10), %xmm15
+	vmulps			%xmm0, %xmm15, %xmm0
+
+	// beta
+	vbroadcastss	0(%r11), %xmm15
+	vmovups			0(%r12), %xmm14
+	vmulps			%xmm15, %xmm14, %xmm14
+	vaddps			%xmm0, %xmm14, %xmm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_ab_4_lib8, .-inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_a1_4_lib8, @function
+inner_blend_t_scale_a1_4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_a1_4_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib8:
+#endif
+#endif
+
+	// reduction
+	vhaddps			%ymm1, %ymm0, %ymm0
+	vhaddps			%ymm3, %ymm2, %ymm2
+
+	vhaddps			%ymm2, %ymm0, %ymm0
+
+	vextractf128	$0x1, %ymm0, %xmm1
+
+	vaddps			%xmm0, %xmm1, %xmm0
+
+	// alpha
+	vbroadcastss	0(%r10), %xmm15
+	vmulps			%xmm0, %xmm15, %xmm0
+
+	// beta
+	vmovups			0(%r11), %xmm14
+	vaddps			%xmm0, %xmm14, %xmm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_a1_4_lib8, .-inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_M11_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_m11_4_lib8, @function
+inner_blend_t_scale_m11_4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_m11_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_m11_4_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_m11_4_lib8:
+#endif
+#endif
+
+	// reduction
+	vhaddps			%ymm1, %ymm0, %ymm0
+	vhaddps			%ymm3, %ymm2, %ymm2
+
+	vhaddps			%ymm2, %ymm0, %ymm0
+
+	vextractf128	$0x1, %ymm0, %xmm1
+
+	vaddps			%xmm0, %xmm1, %xmm0
+
+	// beta
+	vmovups			0(%r10), %xmm14
+	vsubps			%xmm0, %xmm14, %xmm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_m11_4_lib8, .-inner_blend_t_scale_m11_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store 
+//
+// input arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4_lib8, @function
+inner_store_4_lib8:
+#elif defined(OS_MAC)
+_inner_store_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4_lib8; .scl 2; .type 32; .endef
+inner_store_4_lib8:
+#endif
+#endif
+	
+	vmovups %xmm0,  0(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4_lib8, .-inner_store_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store vs
+//
+// input arguments:
+// r10   <- D
+// r11d   <- km
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d   <- km
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4_vs_lib8, @function
+inner_store_4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_4_vs_lib8:
+#endif
+#endif
+	
+	vcvtsi2ss	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %xmm14
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %xmm14
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+//	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%xmm15, %xmm14, %xmm15
+
+	vmaskmovps	%xmm0, %xmm15,  0(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4_vs_lib8, .-inner_store_4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store gen
+//
+// input arguments:
+// r10   <- D
+// r11d  <- k0 : start from (inc)
+// r12d  <- k1 : up to (exc)
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d  <- k0 : start from (inc)
+// r12d  <- k1 : up to (exc)
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4_gen_lib8, @function
+inner_store_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm14, %xmm14
+	vcvtsi2ss	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %xmm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+//	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+//	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%xmm12, %xmm14, %xmm14
+	vsubps		%xmm15, %xmm12, %xmm15
+	vandps		%xmm14, %xmm15, %xmm15
+
+	vmaskmovps	%xmm0, %xmm15,  0(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4_gen_lib8, .-inner_store_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                            1      2              3          4        5          6             7         8
+// void kernel_sgemv_t_4_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_t_4_lib8
+	.type kernel_sgemv_t_4_lib8, @function
+kernel_sgemv_t_4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_t_4_lib8
+_kernel_sgemv_t_4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_t_4_lib8
+	.def kernel_sgemv_t_4_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_t_4_lib8, .-kernel_sgemv_t_4_lib8
+#endif
+
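+// Example call from C (editorial sketch; single-precision arguments as in
+// the prototype above, A panel-major with panel height 8 and panel stride sda):
+//
+//	// z[0..3] = alpha * A(0:k-1,0:3)^T * x + beta * y
+//	float alpha = 1.0f, beta = 0.0f;
+//	kernel_sgemv_t_4_lib8(k, &alpha, A, sda, x, &beta, y, z);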
+
+
+
+
+//                               1      2              3          4        5          6             7         8           9
+// void kernel_sgemv_t_4_vs_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_t_4_vs_lib8
+	.type kernel_sgemv_t_4_vs_lib8, @function
+kernel_sgemv_t_4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_t_4_vs_lib8
+_kernel_sgemv_t_4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_t_4_vs_lib8
+	.def kernel_sgemv_t_4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+	movq	ARG9, %r11 // k1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_t_4_vs_lib8, .-kernel_sgemv_t_4_vs_lib8
+#endif
+
+
+
+
+
+//                                1      2              3         4          5        6          7             8          9          10
+// void kernel_sgemv_t_4_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_t_4_gen_lib8
+	.type kernel_sgemv_t_4_gen_lib8, @function
+kernel_sgemv_t_4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_t_4_gen_lib8
+_kernel_sgemv_t_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_t_4_gen_lib8
+	.def kernel_sgemv_t_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // x
+	movq	ARG3, %r14 // offA
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemv_add_t_4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11   // beta
+	movq	ARG8, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG9, %r10 // z 
+	movq	ARG10, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_t_4_gen_lib8, .-kernel_sgemv_t_4_gen_lib8
+#endif
+
+
+
+
+
+#if 0
+// TODO
+
+//                                 1      2          3        4                   5          6          7
+// void kernel_strsv_lt_inv_8_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsv_lt_inv_8_lib8
+	.type kernel_strsv_lt_inv_8_lib8, @function
+kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsv_lt_inv_8_lib8
+_kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsv_lt_inv_8_lib8
+	.def kernel_strsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	subl	$8, %r10d
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	addq	%r12, %r11 // A+8*sda*sizeof(float)
+	movq	ARG5, %r13 // x
+	addq	$32, %r13 // x+8 
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG6, %r10 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+	// solution
+
+	movq	ARG2, %r10 // A
+	movq	ARG4, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsv_lt_inv_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsv_lt_inv_8_lib8, .-kernel_strsv_lt_inv_8_lib8
+#endif
+
+
+
+
+
+//                                    1      2          3        4                   5          6          7          8      9
+// void kernel_strsv_lt_inv_8_vs_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsv_lt_inv_8_vs_lib8
+	.type kernel_strsv_lt_inv_8_vs_lib8, @function
+kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsv_lt_inv_8_vs_lib8
+_kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsv_lt_inv_8_vs_lib8
+	.def kernel_strsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	subl	$8, %r10d
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	addq	%r12, %r11 // A+8*sda*sizeof(float)
+	movq	ARG5, %r13 // x
+	addq	$32, %r13 // x+8 
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG6, %r10 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+	// solution
+
+	movq	ARG2, %r10 // A
+	movq	ARG4, %r11 // inv_diag_A
+	movq	ARG8, %r12 // km
+	movq	ARG9, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsv_lt_inv_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	ARG8, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsv_lt_inv_8_vs_lib8, .-kernel_strsv_lt_inv_8_vs_lib8
+#endif
+
+#endif
+
+
+
+
+
+//                             1      2                3                4          5        6            7            8               9            10           11
+// void kernel_sgemv_nt_4_lib8(int k, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_nt_4_lib8
+	.type kernel_sgemv_nt_4_lib8, @function
+kernel_sgemv_nt_4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_nt_4_lib8
+_kernel_sgemv_nt_4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_nt_4_lib8
+	.def kernel_sgemv_nt_4_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_nt_4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha_n
+	vbroadcastss 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+
+	vbroadcastss 0(%r10), %ymm6
+	vmulps		%ymm15, %ymm6, %ymm6
+	vbroadcastss 4(%r10), %ymm7
+	vmulps		%ymm15, %ymm7, %ymm7
+	vbroadcastss 8(%r10), %ymm8
+	vmulps		%ymm15, %ymm8, %ymm8
+	vbroadcastss 12(%r10), %ymm9
+	vmulps		%ymm15, %ymm9, %ymm9
+
+
+	// inner kernel sgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+//	movslq	%r12d, %r12
+	movq	ARG7, %r13  // x_t
+	movq	ARG10, %r14  // z_n
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+	movq	ARG3, %r10 // alpha_t
+	movq	ARG8, %r11   // beta_t
+	movq	ARG9, %r12   // y_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG11, %r10 // z_t 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_nt_4_lib8, .-kernel_sgemv_nt_4_lib8
+#endif
+
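+// Example call from C (editorial sketch; single-precision arguments as in
+// the prototype above): one pass over a 4-column strip of A updates both
+// the transposed and the non-transposed product.
+//
+//	// z_t[0..3]   = alpha_t * A^T * x_t + beta_t * y_t
+//	// z_n[0..k-1] += alpha_n * A * x_n        (x_n has 4 entries)
+//	float alpha_n = 1.0f, alpha_t = 1.0f, beta_t = 1.0f;
+//	kernel_sgemv_nt_4_lib8(k, &alpha_n, &alpha_t, A, sda, x_n, x_t,
+//			&beta_t, y_t, z_n, z_t);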
+
+
+
+
+//                                1      2                3                4          5        6            7            8               9            10           11           12
+// void kernel_sgemv_nt_4_vs_lib8(int k, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_nt_4_vs_lib8
+	.type kernel_sgemv_nt_4_vs_lib8, @function
+kernel_sgemv_nt_4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_nt_4_vs_lib8
+_kernel_sgemv_nt_4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_nt_4_vs_lib8
+	.def kernel_sgemv_nt_4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_nt_4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha_n
+	vbroadcastss 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+	movq	ARG12, %r11 // km
+
+	vbroadcastss 0(%r10), %ymm6
+	vmulps		%ymm15, %ymm6, %ymm6
+	cmpl	$2, %r11d
+	jl		0f
+	vbroadcastss 4(%r10), %ymm7
+	vmulps		%ymm15, %ymm7, %ymm7
+	cmpl	$3, %r11d
+	jl		0f
+	vbroadcastss 8(%r10), %ymm8
+	vmulps		%ymm15, %ymm8, %ymm8
+	je		0f
+	vbroadcastss 12(%r10), %ymm9
+	vmulps		%ymm15, %ymm9, %ymm9
+0:
+
+	// inner kernel sgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+//	movslq	%r12d, %r12
+	movq	ARG7, %r13  // x_t
+	movq	ARG10, %r14  // z_n
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+	movq	ARG3, %r10 // alpha_t
+	movq	ARG8, %r11   // beta_t
+	movq	ARG9, %r12   // y_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG11, %r10 // z_t 
+	movq	ARG12, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_nt_4_vs_lib8, .-kernel_sgemv_nt_4_vs_lib8
+#endif
+
+
+
+
+
+//                             1      2              3          4        5          6
+// void kernel_ssymv_l_4l_lib8(int k, float *alpha, float *A, int sda, float *x, float *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssymv_l_4l_lib8
+	.type kernel_ssymv_l_4l_lib8, @function
+kernel_ssymv_l_4l_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssymv_l_4l_lib8
+_kernel_ssymv_l_4l_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssymv_l_4l_lib8
+	.def kernel_ssymv_l_4l_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4l_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha
+	vbroadcastss 0(%r10), %ymm15
+
+	movq	ARG5, %r10 // x_n
+
+	vbroadcastss 0(%r10), %ymm6
+	vmulps		%ymm15, %ymm6, %ymm6
+	vbroadcastss 4(%r10), %ymm7
+	vmulps		%ymm15, %ymm7, %ymm7
+	vbroadcastss 8(%r10), %ymm8
+	vmulps		%ymm15, %ymm8, %ymm8
+	vbroadcastss 12(%r10), %ymm9
+	vmulps		%ymm15, %ymm9, %ymm9
+
+
+	// inner edge ssymv & kernel sgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // x_t
+	movq	ARG6, %r14  // z_n
+	movq	$0, %r15 // offA
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_SYMV_ADD_NT_4L_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_symv_add_nt_4l_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_symv_add_nt_4l_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // z_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // z_t 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssymv_l_4l_lib8, .-kernel_ssymv_l_4l_lib8
+#endif
+
+
+
+
+
+//                             1      2              3          4        5          6
+// void kernel_ssymv_l_4r_lib8(int k, float *alpha, float *A, int sda, float *x, float *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssymv_l_4r_lib8
+	.type kernel_ssymv_l_4r_lib8, @function
+kernel_ssymv_l_4r_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssymv_l_4r_lib8
+_kernel_ssymv_l_4r_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssymv_l_4r_lib8
+	.def kernel_ssymv_l_4r_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4r_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha
+	vbroadcastss 0(%r10), %ymm15
+
+	movq	ARG5, %r10 // x_n
+
+	vbroadcastss 0(%r10), %ymm6
+	vmulps		%ymm15, %ymm6, %ymm6
+	vbroadcastss 4(%r10), %ymm7
+	vmulps		%ymm15, %ymm7, %ymm7
+	vbroadcastss 8(%r10), %ymm8
+	vmulps		%ymm15, %ymm8, %ymm8
+	vbroadcastss 12(%r10), %ymm9
+	vmulps		%ymm15, %ymm9, %ymm9
+
+
+	// inner edge ssymv & kernel sgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // x_t
+	movq	ARG6, %r14  // z_n
+	movq	$0, %r15 // offA
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_SYMV_ADD_NT_4R_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_symv_add_nt_4r_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_symv_add_nt_4r_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // z_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // z_t 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssymv_l_4r_lib8, .-kernel_ssymv_l_4r_lib8
+#endif
+
+
+
+
+
+//                                1      2              3          4          5        6          7          8
+// void kernel_ssymv_l_4l_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *z, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssymv_l_4l_gen_lib8
+	.type kernel_ssymv_l_4l_gen_lib8, @function
+kernel_ssymv_l_4l_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssymv_l_4l_gen_lib8
+_kernel_ssymv_l_4l_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssymv_l_4l_gen_lib8
+	.def kernel_ssymv_l_4l_gen_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4l_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha
+	vbroadcastss 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+	movq	ARG8, %r11 // km
+
+	vbroadcastss 0(%r10), %ymm6
+	vmulps		%ymm15, %ymm6, %ymm6
+	cmpl	$2, %r11d
+	jl		0f
+	vbroadcastss 4(%r10), %ymm7
+	vmulps		%ymm15, %ymm7, %ymm7
+	cmpl	$3, %r11d
+	jl		0f
+	vbroadcastss 8(%r10), %ymm8
+	vmulps		%ymm15, %ymm8, %ymm8
+	je		0f
+	vbroadcastss 12(%r10), %ymm9
+	vmulps		%ymm15, %ymm9, %ymm9
+0:
+
+	// inner edge symv & kernel gemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // x_t
+	movq	ARG7, %r14  // z_n
+	movq	ARG3, %r15 // offA
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_SYMV_ADD_NT_4L_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_symv_add_nt_4l_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_symv_add_nt_4l_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11   // z_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z_t 
+	movq	ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssymv_l_4l_gen_lib8, .-kernel_ssymv_l_4l_gen_lib8
+#endif
+
+
+
+
+
+//                                1      2              3          4          5        6          7          8
+// void kernel_ssymv_l_4r_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *z, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssymv_l_4r_gen_lib8
+	.type kernel_ssymv_l_4r_gen_lib8, @function
+kernel_ssymv_l_4r_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssymv_l_4r_gen_lib8
+_kernel_ssymv_l_4r_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssymv_l_4r_gen_lib8
+	.def kernel_ssymv_l_4r_gen_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4r_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha
+	vbroadcastss 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+	movq	ARG8, %r11 // km
+
+	vbroadcastss 0(%r10), %ymm6
+	vmulps		%ymm15, %ymm6, %ymm6
+	cmpl	$2, %r11d
+	jl		0f
+	vbroadcastss 4(%r10), %ymm7
+	vmulps		%ymm15, %ymm7, %ymm7
+	cmpl	$3, %r11d
+	jl		0f
+	vbroadcastss 8(%r10), %ymm8
+	vmulps		%ymm15, %ymm8, %ymm8
+	je		0f
+	vbroadcastss 12(%r10), %ymm9
+	vmulps		%ymm15, %ymm9, %ymm9
+0:
+
+	// inner edge symv & kernel gemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // x_t
+	movq	ARG7, %r14  // z_n
+	movq	ARG3, %r15 // offA
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_SYMV_ADD_NT_4R_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_symv_add_nt_4r_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_symv_add_nt_4r_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11   // z_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z_t 
+	movq	ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssymv_l_4r_gen_lib8, .-kernel_ssymv_l_4r_gen_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.float	0.5
+	.float	1.5
+	.float	2.5
+	.float	3.5
+	.float	4.5
+	.float	5.5
+	.float	6.5
+	.float	7.5
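+// (lane indices offset by 0.5: subtracting an integer count from this table
+//  makes exactly the lanes covered by that count negative, and those sign
+//  bits are what drive vmaskmovps in the partial loads/stores of this file)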
+
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
+
diff --git a/kernel/avx/kernel_sgemv_8_lib8.S b/kernel/avx/kernel_sgemv_8_lib8.S
new file mode 100644
index 0000000..aafd8cb
--- /dev/null
+++ b/kernel/avx/kernel_sgemv_8_lib8.S
@@ -0,0 +1,2837 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- x
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z0 z1 z2 z3]_b
+// ymm2  <- [z0 z1 z2 z3]_c
+// ymm3  <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- x+k*sizeof(float)
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z0 z1 z2 z3]_b
+// ymm2  <- [z0 z1 z2 z3]_c
+// ymm3  <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
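+//
+// A here is one 8-row panel stored column-major inside the panel, so column
+// j is the 8 contiguous floats at A + 8*j; each step broadcasts x[j] and
+// accumulates A(:,j)*x[j] into one of the four partial sums ymm0..ymm3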
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemv_add_n_8_lib8, @function
+inner_kernel_gemv_add_n_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_n_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemv_add_n_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_n_8_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm0, %ymm15, %ymm0
+	
+	subl	$4, %r10d
+
+	vmovaps			32(%r11), %ymm8
+	vbroadcastss	4(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm1, %ymm15, %ymm1
+	
+	vmovaps			64(%r11), %ymm8
+	vbroadcastss	8(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm2, %ymm15, %ymm2
+
+	vmovaps			96(%r11), %ymm8
+	vbroadcastss	12(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm3, %ymm15, %ymm3
+	
+	addq	$128, %r11
+	addq	$16, %r12
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm0, %ymm15, %ymm0
+	
+	addq	$32, %r11
+	addq	$4, %r12
+	
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+
+	jg		0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemv_add_n_8_lib8, .-inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm4  <- [z4a z4b z4c z4d]
+// ymm5  <- [z5a z5b z5c z5d]
+// ymm6  <- [z6a z6b z6c z6d]
+// ymm7  <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+k*sda*sizeof(float)
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x+k*sizeof(float)
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm4  <- [z4a z4b z4c z4d]
+// ymm5  <- [z5a z5b z5c z5d]
+// ymm6  <- [z6a z6b z6c z6d]
+// ymm7  <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
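+//
+// the transposed product is built 8 rows at a time: x[0:8] is loaded once
+// per panel, each of the 8 panel columns feeds its own accumulator
+// ymm0..ymm7, and adding r12 (= 8*sda floats) moves A to the next panel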
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemv_add_t_8_lib8, @function
+inner_kernel_gemv_add_t_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_t_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemv_add_t_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_t_8_lib8:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$8, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovups		0(%r13), %ymm12
+
+	vmovaps		0(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	
+	subl	$8, %r10d
+
+	vmovaps		32(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	
+	vmovaps		64(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+
+	vmovaps		96(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+	
+	vmovaps		128(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+	
+	vmovaps		160(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+	
+	vmovaps		192(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+	
+	vmovaps		224(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+	
+	addq	%r12, %r11
+	addq	$32, %r13
+	
+	cmpl	$7, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vcvtsi2ss	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%ymm14, %ymm13, %ymm14
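+	// lanes whose index i satisfies i+0.5 < k_left are now negative, so their
+	// sign bit enables them in the masked loads below; e.g. with k_left=3 the
+	// mask vector is { -2.5 -1.5 -0.5 0.5 1.5 2.5 3.5 4.5 } and only the
+	// first 3 remaining rows are read and accumulated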
+
+	vmaskmovps	0(%r13), %ymm14, %ymm12
+
+	vmaskmovps	0(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	
+	vmaskmovps	32(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	
+	vmaskmovps	64(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+
+	vmaskmovps	96(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+		
+	vmaskmovps	128(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+		
+	vmaskmovps	160(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+		
+	vmaskmovps	192(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+		
+	vmaskmovps	224(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+		
+	sall	$2, %r10d
+	addq	%r10, %r11
+	addq	%r10, %r13
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemv_add_t_8_lib8, .-inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x
+// r14d  <- offA
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm4  <- [z4a z4b z4c z4d]
+// ymm5  <- [z5a z5b z5c z5d]
+// ymm6  <- [z6a z6b z6c z6d]
+// ymm7  <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 
+// r11   <- 
+// r12   <- 
+// r13   <- 
+// r14d  <- offA
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm4  <- [z4a z4b z4c z4d]
+// ymm5  <- [z5a z5b z5c z5d]
+// ymm6  <- [z6a z6b z6c z6d]
+// ymm7  <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
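+//
+// edge case for a panel that starts offA rows down: A and x are rewound by
+// offA, a mask keeps only lanes offA <= i < offA+k (capped at 8), the
+// partial panel is accumulated, and k is reduced by 8-offA before falling
+// through to the main kernel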
+
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_GEMV_ADD_T_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_gemv_add_t_8_lib8, @function
+inner_edge_gemv_add_t_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemv_add_t_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_gemv_add_t_8_lib8; .scl 2; .type 32; .endef
+inner_edge_gemv_add_t_8_lib8:
+#endif
+#endif
+
+	cmpl	$0, %r14d
+	jle		0f // return
+
+	movl	%r14d, %r15d
+	sall	$2, %r15d // offA*sizeof(float)
+
+	subq	%r15, %r11 // A - offA
+	subq	%r15, %r13 // x - offA
+
+	movl	%r10d, %r15d // kmax
+	addl	%r14d, %r15d // kmax + offA
+
+	vcvtsi2ss	%r14d, %xmm14, %xmm14 // offA
+	vcvtsi2ss	%r15d, %xmm15, %xmm15 // offA + kmax
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm13, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm13, %ymm15
+	vandps		%ymm15, %ymm14, %ymm14
+
+	vmaskmovps	0(%r13), %ymm14, %ymm12
+
+	vmovaps		0(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	
+	vmovaps		32(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	
+	vmovaps		64(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+
+	vmovaps		96(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+	vmovaps		128(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+
+	vmovaps		160(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+
+	vmovaps		192(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+
+	vmovaps		224(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+
+	addq	$32, %r13 // x + 4
+	addq	%r12, %r11 // A + bs*sda
+		
+	addl	%r14d, %r10d
+	subl	$8, %r10d // kmax - (8-offA)
+	
+0: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_gemv_add_t_8_lib8, .-inner_edge_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
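+//
+// forward substitution with the 8x8 lower-triangular block E (stored
+// column-major in the panel) and precomputed diagonal reciprocals,
+// conceptually (a sketch):
+//
+//   for(i=0; i<8; i++) {
+//     z[i] *= inv_diag_E[i];
+//     for(j=i+1; j<8; j++)
+//       z[j] -= E[j+8*i] * z[i];
+//   }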
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSV_LN_INV_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsv_ln_inv_8_lib8, @function
+inner_edge_trsv_ln_inv_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_ln_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsv_ln_inv_8_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_ln_inv_8_lib8:
+#endif
+#endif
+	
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vbroadcastss	0(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x01, %ymm1, %ymm0, %ymm0
+
+	vmovaps			0(%r10), %ymm13
+	vblendps		$0x01, %ymm14, %ymm13, %ymm13
+	vpermilps		$0x00, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	4(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x02, %ymm1, %ymm0, %ymm0
+
+	vmovaps			32(%r10), %ymm13
+	vblendps		$0x03, %ymm14, %ymm13, %ymm13
+	vpermilps		$0x55, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	8(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x04, %ymm1, %ymm0, %ymm0
+
+	vmovaps			64(%r10), %ymm13
+	vblendps		$0x07, %ymm14, %ymm13, %ymm13
+	vpermilps		$0xaa, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	12(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x08, %ymm1, %ymm0, %ymm0
+
+	vmovaps			96(%r10), %ymm13
+	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
+	vpermilps		$0xff, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	16(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x10, %ymm1, %ymm0, %ymm0
+
+	vmovaps			128(%r10), %ymm13
+	vblendps		$0x1f, %ymm14, %ymm13, %ymm13
+	vpermilps		$0x00, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	20(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x20, %ymm1, %ymm0, %ymm0
+
+	vmovaps			160(%r10), %ymm13
+	vblendps		$0x3f, %ymm14, %ymm13, %ymm13
+	vpermilps		$0x55, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	24(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x40, %ymm1, %ymm0, %ymm0
+
+	vmovaps			192(%r10), %ymm13
+	vblendps		$0x7f, %ymm14, %ymm13, %ymm13
+	vpermilps		$0xaa, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	28(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x80, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsv_ln_inv_8_lib8, .-inner_edge_trsv_ln_inv_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSV_LN_INV_8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsv_ln_inv_8_vs_lib8, @function
+inner_edge_trsv_ln_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_ln_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsv_ln_inv_8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_ln_inv_8_vs_lib8:
+#endif
+#endif
+	
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vbroadcastss	0(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x01, %ymm1, %ymm0, %ymm0
+	vmovaps			0(%r10), %ymm13
+	vblendps		$0x01, %ymm14, %ymm13, %ymm13
+	vpermilps		$0x00, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+
+	cmpl			$2, %r12d
+	jl				0f // ret
+
+	vbroadcastss	4(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x02, %ymm1, %ymm0, %ymm0
+	vmovaps			32(%r10), %ymm13
+	vblendps		$0x03, %ymm14, %ymm13, %ymm13
+	vpermilps		$0x55, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+
+	cmpl			$3, %r12d
+	jl				0f // ret
+
+	vbroadcastss	8(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x04, %ymm1, %ymm0, %ymm0
+	vmovaps			64(%r10), %ymm13
+	vblendps		$0x07, %ymm14, %ymm13, %ymm13
+	vpermilps		$0xaa, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+
+	cmpl			$4, %r12d
+	jl				0f // ret
+
+	vbroadcastss	12(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x08, %ymm1, %ymm0, %ymm0
+	vmovaps			96(%r10), %ymm13
+	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
+	vpermilps		$0xff, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+
+	cmpl			$5, %r12d
+	jl				0f // ret
+
+	vbroadcastss	16(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x10, %ymm1, %ymm0, %ymm0
+	vmovaps			128(%r10), %ymm13
+	vblendps		$0x1f, %ymm14, %ymm13, %ymm13
+	vpermilps		$0x00, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+
+	cmpl			$6, %r12d
+	jl				0f // ret
+
+	vbroadcastss	20(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x20, %ymm1, %ymm0, %ymm0
+	vmovaps			160(%r10), %ymm13
+	vblendps		$0x3f, %ymm14, %ymm13, %ymm13
+	vpermilps		$0x55, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+
+	cmpl			$7, %r12d
+	jl				0f // ret
+
+	vbroadcastss	24(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x40, %ymm1, %ymm0, %ymm0
+	vmovaps			192(%r10), %ymm13
+	vblendps		$0x7f, %ymm14, %ymm13, %ymm13
+	vpermilps		$0xaa, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+
+	cmpl			$8, %r12d
+	jl				0f // ret
+
+	vbroadcastss	28(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x80, %ymm1, %ymm0, %ymm0
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsv_ln_inv_8_vs_lib8, .-inner_edge_trsv_ln_inv_8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
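+//
+// back-substitution with the transpose of the 8x8 lower-triangular block E:
+// the unpck/shufps sequence below transposes the block in registers, then
+// z[7] down to z[0] are solved as in a scalar L^T z = x backward solve,
+// again using the reciprocals in inv_diag_E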
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsv_lt_inv_8_lib8, @function
+inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_lib8:
+#endif
+#endif
+	
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vmovaps			0(%r10), %ymm12
+	vblendps		$0x01, %ymm14, %ymm12, %ymm12
+	vmovaps			32(%r10), %ymm13
+	vblendps		$0x03, %ymm14, %ymm13, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm8
+	vunpckhps		%ymm13, %ymm12, %ymm9
+
+	vmovaps			64(%r10), %ymm12
+	vblendps		$0x07, %ymm14, %ymm12, %ymm12
+	vmovaps			96(%r10), %ymm13
+	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm10
+	vunpckhps		%ymm13, %ymm12, %ymm11
+
+	vshufps			$0x44, %ymm10, %ymm8, %ymm7
+	vshufps			$0xee, %ymm10, %ymm8, %ymm4
+	vshufps			$0x44, %ymm11, %ymm9, %ymm5
+	vshufps			$0xee, %ymm11, %ymm9, %ymm6
+	vextractf128	$0x1, %ymm7, %xmm7
+	vextractf128	$0x1, %ymm4, %xmm8
+	vextractf128	$0x1, %ymm5, %xmm9
+	vextractf128	$0x1, %ymm6, %xmm10
+
+	vmovaps			144(%r10), %xmm12
+	vblendps		$0x01, %xmm14, %xmm12, %xmm12
+	vmovaps			176(%r10), %xmm13
+	vblendps		$0x03, %xmm14, %xmm13, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm1
+	vunpckhps		%xmm13, %xmm12, %xmm2
+
+	vmovaps			208(%r10), %xmm12
+	vblendps		$0x07, %xmm14, %xmm12, %xmm12
+	vmovaps			240(%r10), %xmm13
+	vblendps		$0x0f, %xmm14, %xmm13, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm3
+	vunpckhps		%xmm13, %xmm12, %xmm15
+
+	vshufps			$0xee, %xmm3, %xmm1, %xmm11
+	vshufps			$0x44, %xmm15, %xmm2, %xmm12
+	vshufps			$0xee, %xmm15, %xmm2, %xmm13
+
+
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vextractf128	$0x1, %ymm0, %xmm1
+
+	vshufps			$0xff, %xmm1, %xmm1, %xmm2
+	vbroadcastss	28(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x08, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm10, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm13, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+	vshufps			$0xaa, %xmm1, %xmm1, %xmm2
+	vbroadcastss	24(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x04, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm9, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm12, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+	vshufps			$0x55, %xmm1, %xmm1, %xmm2
+	vbroadcastss	20(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x02, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm8, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm11, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+	vshufps			$0x00, %xmm1, %xmm1, %xmm2
+	vbroadcastss	16(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x01, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm7, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+	vshufps			$0xff, %xmm0, %xmm0, %xmm2
+	vbroadcastss	12(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x08, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm6, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+	vshufps			$0xaa, %xmm0, %xmm0, %xmm2
+	vbroadcastss	8(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x04, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm5, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+	vshufps			$0x55, %xmm0, %xmm0, %xmm2
+	vbroadcastss	4(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x02, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm4, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+	vshufps			$0x00, %xmm0, %xmm0, %xmm2
+	vbroadcastss	0(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x01, %xmm2, %xmm0, %xmm0
+
+	vinsertf128		$0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsv_lt_inv_8_lib8, .-inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- km
+// r13  <- kn
+// r14  <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- km
+// r13  <- kn
+// r14  <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsv_lt_inv_8_vs_lib8, @function
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#endif
+#endif
+	
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%ymm14, %ymm13, %ymm14
+
+	vmovups		0(%r14), %ymm15
+	vblendvps	%ymm14, %ymm0, %ymm15, %ymm0
+
+
+
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vmovaps			0(%r10), %ymm12
+	vblendps		$0x01, %ymm14, %ymm12, %ymm12
+	cmpl	$2, %r13d
+	jl		1f
+	vmovaps			32(%r10), %ymm13
+	vblendps		$0x03, %ymm14, %ymm13, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm8
+	vunpckhps		%ymm13, %ymm12, %ymm9
+
+	cmpl	$3, %r13d
+	jl		2f
+	vmovaps			64(%r10), %ymm12
+	vblendps		$0x07, %ymm14, %ymm12, %ymm12
+	cmpl	$4, %r13d
+	jl		3f
+	vmovaps			96(%r10), %ymm13
+	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm10
+	vunpckhps		%ymm13, %ymm12, %ymm11
+
+	vshufps			$0x44, %ymm10, %ymm8, %ymm7
+	vshufps			$0xee, %ymm10, %ymm8, %ymm4
+	vshufps			$0x44, %ymm11, %ymm9, %ymm5
+	vshufps			$0xee, %ymm11, %ymm9, %ymm6
+	vextractf128	$0x1, %ymm7, %xmm7
+	vextractf128	$0x1, %ymm4, %xmm8
+	vextractf128	$0x1, %ymm5, %xmm9
+	vextractf128	$0x1, %ymm6, %xmm10
+
+	cmpl	$5, %r13d
+	jl		4f
+	vmovaps			144(%r10), %xmm12
+	vblendps		$0x01, %xmm14, %xmm12, %xmm12
+	cmpl	$6, %r13d
+	jl		5f
+	vmovaps			176(%r10), %xmm13
+	vblendps		$0x03, %xmm14, %xmm13, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm1
+	vunpckhps		%xmm13, %xmm12, %xmm2
+
+	cmpl	$7, %r13d
+	jl		6f
+	vmovaps			208(%r10), %xmm12
+	vblendps		$0x07, %xmm14, %xmm12, %xmm12
+	cmpl	$8, %r13d
+	jl		7f
+	vmovaps			240(%r10), %xmm13
+	vblendps		$0x0f, %xmm14, %xmm13, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm3
+	vunpckhps		%xmm13, %xmm12, %xmm15
+
+	vshufps			$0xee, %xmm3, %xmm1, %xmm11
+	vshufps			$0x44, %xmm15, %xmm2, %xmm12
+	vshufps			$0xee, %xmm15, %xmm2, %xmm13
+
+	jmp		0f
+
+
+
+	vmovaps			%ymm14, %ymm12
+1:
+	vmovaps			%ymm14, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm8
+	vunpckhps		%ymm13, %ymm12, %ymm9
+
+2:
+	vmovaps			%ymm14, %ymm12
+3:
+	vmovaps			%ymm14, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm10
+	vunpckhps		%ymm13, %ymm12, %ymm11
+
+	vshufps			$0x44, %ymm10, %ymm8, %ymm7
+	vshufps			$0xee, %ymm10, %ymm8, %ymm4
+	vshufps			$0x44, %ymm11, %ymm9, %ymm5
+	vshufps			$0xee, %ymm11, %ymm9, %ymm6
+	vextractf128	$0x1, %ymm7, %xmm7
+	vextractf128	$0x1, %ymm4, %xmm8
+	vextractf128	$0x1, %ymm5, %xmm9
+	vextractf128	$0x1, %ymm6, %xmm10
+
+	jmp		8f
+
+4:
+	vmovaps			%xmm14, %xmm12
+5:
+	vmovaps			%xmm14, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm1
+	vunpckhps		%xmm13, %xmm12, %xmm2
+
+6:
+	vmovaps			%xmm14, %xmm12
+7:
+	vmovaps			%xmm14, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm3
+	vunpckhps		%xmm13, %xmm12, %xmm15
+
+	vshufps			$0xee, %xmm3, %xmm1, %xmm11
+	vshufps			$0x44, %xmm15, %xmm2, %xmm12
+	vshufps			$0xee, %xmm15, %xmm2, %xmm13
+
+8:
+	
+	vmovaps			%xmm14, %xmm11
+	vmovaps			%xmm14, %xmm12
+	vmovaps			%xmm14, %xmm13
+
+0:
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vextractf128	$0x1, %ymm0, %xmm1
+
+	cmpl	$8, %r12d
+	jl		0f
+
+	vshufps			$0xff, %xmm1, %xmm1, %xmm2
+	cmpl	$8, %r13d
+	jl		1f
+	vbroadcastss	28(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x08, %xmm2, %xmm1, %xmm1
+1:
+	vmulps			%xmm10, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm13, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+0:
+	cmpl	$7, %r12d
+	jl		0f
+
+	vshufps			$0xaa, %xmm1, %xmm1, %xmm2
+	cmpl	$7, %r13d
+	jl		1f
+	vbroadcastss	24(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x04, %xmm2, %xmm1, %xmm1
+1:
+	vmulps			%xmm9, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm12, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+0:
+	cmpl	$6, %r12d
+	jl		0f
+
+	vshufps			$0x55, %xmm1, %xmm1, %xmm2
+	cmpl	$6, %r13d
+	jl		1f
+	vbroadcastss	20(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x02, %xmm2, %xmm1, %xmm1
+1:
+	vmulps			%xmm8, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm11, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+0:
+	cmpl	$5, %r12d
+	jl		0f
+
+	vshufps			$0x00, %xmm1, %xmm1, %xmm2
+	cmpl	$5, %r13d
+	jl		1f
+	vbroadcastss	16(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x01, %xmm2, %xmm1, %xmm1
+1:
+	vmulps			%xmm7, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+0:
+	cmpl	$4, %r12d
+	jl		0f
+
+	vshufps			$0xff, %xmm0, %xmm0, %xmm2
+	cmpl	$4, %r13d
+	jl		1f
+	vbroadcastss	12(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x08, %xmm2, %xmm0, %xmm0
+1:
+	vmulps			%xmm6, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+0:
+	cmpl	$3, %r12d
+	jl		0f
+
+	vshufps			$0xaa, %xmm0, %xmm0, %xmm2
+	cmpl	$3, %r13d
+	jl		1f
+	vbroadcastss	8(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x04, %xmm2, %xmm0, %xmm0
+1:
+	vmulps			%xmm5, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+0:
+	cmpl	$2, %r12d
+	jl		0f
+
+	vshufps			$0x55, %xmm0, %xmm0, %xmm2
+	cmpl	$2, %r13d
+	jl		1f
+	vbroadcastss	4(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x02, %xmm2, %xmm0, %xmm0
+1:
+	vmulps			%xmm4, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+0:
+	cmpl	$1, %r12d
+	jl		0f
+
+	vshufps			$0x00, %xmm0, %xmm0, %xmm2
+	cmpl	$1, %r13d
+	jl		1f
+	vbroadcastss	0(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x01, %xmm2, %xmm0, %xmm0
+1:
+
+0:
+
+	vinsertf128		$0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsv_lt_inv_8_vs_lib8, .-inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_scale_ab_8_lib8, @function
+inner_blend_n_scale_ab_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_scale_ab_8_lib8; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_8_lib8:
+#endif
+#endif
+
+	// reduction
+	vaddps			%ymm0, %ymm1, %ymm0
+	vaddps			%ymm2, %ymm3, %ymm2
+	vaddps			%ymm0, %ymm2, %ymm0
+
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+	vmulps			%ymm0, %ymm15, %ymm0
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+	vmovups			0(%r12), %ymm14
+	vmulps			%ymm15, %ymm14, %ymm14
+	vaddps			%ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_scale_ab_8_lib8, .-inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_SCALE_M11_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_scale_m11_8_lib8, @function
+inner_blend_n_scale_m11_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_m11_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_scale_m11_8_lib8; .scl 2; .type 32; .endef
+inner_blend_n_scale_m11_8_lib8:
+#endif
+#endif
+
+	// reduction
+	vaddps	%ymm0, %ymm1, %ymm0
+	vaddps	%ymm2, %ymm3, %ymm2
+	vaddps	%ymm0, %ymm2, %ymm0
+
+	// beta
+	vmovups		0(%r10), %ymm14
+	vsubps		%ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_scale_m11_8_lib8, .-inner_blend_n_scale_m11_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
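+//
+// the vhaddps tree collapses the 8 per-column accumulators into one vector
+// holding the 8 dot products (the two vperm2f128 fix up the 128-bit lane
+// order), which is then scaled by alpha and combined with beta*y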
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_ab_8_lib8, @function
+inner_blend_t_scale_ab_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_ab_8_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_8_lib8:
+#endif
+#endif
+
+	// reduction
+	vhaddps			%ymm1, %ymm0, %ymm0
+	vhaddps			%ymm3, %ymm2, %ymm2
+	vhaddps			%ymm5, %ymm4, %ymm4
+	vhaddps			%ymm7, %ymm6, %ymm6
+
+	vhaddps			%ymm2, %ymm0, %ymm0
+	vhaddps			%ymm6, %ymm4, %ymm4
+
+	vperm2f128		$0x20, %ymm4, %ymm0, %ymm1
+	vperm2f128		$0x13, %ymm0, %ymm4, %ymm0
+
+	vaddps			%ymm0, %ymm1, %ymm0
+
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+	vmulps			%ymm0, %ymm15, %ymm0
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+	vmovups			0(%r12), %ymm14
+	vmulps			%ymm15, %ymm14, %ymm14
+	vaddps			%ymm0, %ymm14, %ymm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_ab_8_lib8, .-inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_m11_8_lib8, @function
+inner_blend_t_scale_m11_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_m11_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_m11_8_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_m11_8_lib8:
+#endif
+#endif
+
+	// reduction
+	vhaddps			%ymm1, %ymm0, %ymm0
+	vhaddps			%ymm3, %ymm2, %ymm2
+	vhaddps			%ymm5, %ymm4, %ymm4
+	vhaddps			%ymm7, %ymm6, %ymm6
+
+	vhaddps			%ymm2, %ymm0, %ymm0
+	vhaddps			%ymm6, %ymm4, %ymm4
+
+	vperm2f128		$0x20, %ymm4, %ymm0, %ymm1
+	vperm2f128		$0x13, %ymm0, %ymm4, %ymm0
+
+	vaddps			%ymm0, %ymm1, %ymm0
+
+	// beta
+	vmovups			0(%r10), %ymm14
+	vsubps			%ymm0, %ymm14, %ymm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_m11_8_lib8, .-inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store 
+//
+// input arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8_lib8, @function
+inner_store_8_lib8:
+#elif defined(OS_MAC)
+_inner_store_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8_lib8; .scl 2; .type 32; .endef
+inner_store_8_lib8:
+#endif
+#endif
+	
+	vmovups %ymm0,  0(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8_lib8, .-inner_store_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store vs
+//
+// input arguments:
+// r10   <- D
+// r11d   <- km
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d   <- km
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8_vs_lib8, @function
+inner_store_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8_vs_lib8:
+#endif
+#endif
+	
+	vcvtsi2ss	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm14
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm14, %ymm15
+
+	vmaskmovps	%ymm0, %ymm15,  0(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8_vs_lib8, .-inner_store_8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store gen
+//
+// input arguments:
+// r10   <- D
+// r11d  <- k0 : start from (inc)
+// r12d  <- k1 : up to (exc)
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d  <- k0 : start from (inc)
+// r12d  <- k1 : up to (exc)
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
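+//
+// the mask keeps exactly the lanes k0 <= i < k1: (k0 - LC00) is negative
+// for i >= k0, (LC00 - k1) is negative for i < k1, and the AND of the two
+// selects the lanes passed to vmaskmovps; e.g. k0=2, k1=5 stores only
+// z[2], z[3] and z[4]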
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8_gen_lib8, @function
+inner_store_8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm14, %xmm14
+	vcvtsi2ss	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm12, %ymm15
+	vandps		%ymm14, %ymm15, %ymm15
+
+	vmaskmovps	%ymm0, %ymm15,  0(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8_gen_lib8, .-inner_store_8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                            1      2              3          4          5             6          7
+// void kernel_sgemv_n_8_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z);
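+//
+// computes z[0:8] = alpha*A*x + beta*y for one 8-row panel of A with k
+// columns; roughly equivalent to the following C sketch (the temporary t[]
+// is just for illustration):
+//
+//   float t[8] = {0.0f};
+//   for(j=0; j<k; j++)
+//     for(i=0; i<8; i++)
+//       t[i] += A[i+8*j] * x[j];
+//   for(i=0; i<8; i++)
+//     z[i] = alpha[0]*t[i] + beta[0]*y[i];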
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_n_8_lib8
+	.type kernel_sgemv_n_8_lib8, @function
+kernel_sgemv_n_8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_n_8_lib8
+_kernel_sgemv_n_8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_n_8_lib8
+	.def kernel_sgemv_n_8_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_n_8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11   // beta
+	movq	ARG6, %r12   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_n_8_lib8, .-kernel_sgemv_n_8_lib8
+#endif
+
+
+
+
+
+//                               1      2              3          4          5             6          7          8
+// void kernel_sgemv_n_8_vs_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_n_8_vs_lib8
+	.type kernel_sgemv_n_8_vs_lib8, @function
+kernel_sgemv_n_8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_n_8_vs_lib8
+_kernel_sgemv_n_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_n_8_vs_lib8
+	.def kernel_sgemv_n_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_n_8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11   // beta
+	movq	ARG6, %r12   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	ARG8, %r11 // k1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_n_8_vs_lib8, .-kernel_sgemv_n_8_vs_lib8
+#endif
+
+
+
+
+
+//                                1      2              3          4          5             6          7          8       9
+// void kernel_sgemv_n_8_gen_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_n_8_gen_lib8
+	.type kernel_sgemv_n_8_gen_lib8, @function
+kernel_sgemv_n_8_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_n_8_gen_lib8
+_kernel_sgemv_n_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_n_8_gen_lib8
+	.def kernel_sgemv_n_8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_n_8_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11   // beta
+	movq	ARG6, %r12   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	ARG8, %r11 // k0 
+	movq	ARG9, %r12 // k1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_n_8_gen_lib8, .-kernel_sgemv_n_8_gen_lib8
+#endif
+
+
+
+
+
+//                            1      2              3          4        5          6             7         8
+// void kernel_sgemv_t_8_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);
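+//
+// computes z[0:8] = alpha*A'*x + beta*y, where the 8 columns of A span k
+// rows stored in 8-row panels with panel stride sda; roughly equivalent to
+// the following C sketch (the temporary t[] is just for illustration):
+//
+//   float t[8] = {0.0f};
+//   for(ii=0; ii<k; ii+=8)
+//     for(j=0; j<8; j++)
+//       for(i=0; i<8 && ii+i<k; i++)
+//         t[j] += A[ii*sda + 8*j + i] * x[ii+i];
+//   for(j=0; j<8; j++)
+//     z[j] = alpha[0]*t[j] + beta[0]*y[j];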
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_t_8_lib8
+	.type kernel_sgemv_t_8_lib8, @function
+kernel_sgemv_t_8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_t_8_lib8
+_kernel_sgemv_t_8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_t_8_lib8
+	.def kernel_sgemv_t_8_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_t_8_lib8, .-kernel_sgemv_t_8_lib8
+#endif
+
+
+
+
+
+//                               1      2              3          4        5          6             7         8           9
+// void kernel_sgemv_t_8_vs_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_t_8_vs_lib8
+	.type kernel_sgemv_t_8_vs_lib8, @function
+kernel_sgemv_t_8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_t_8_vs_lib8
+_kernel_sgemv_t_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_t_8_vs_lib8
+	.def kernel_sgemv_t_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+	movq	ARG9, %r11 // k1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_t_8_vs_lib8, .-kernel_sgemv_t_8_vs_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4         5        6         7            8         9         10
+// void kernel_sgemv_t_8_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km);
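+//
+// Editor's note: generalized variant of kernel_sgemv_t_8_lib8 above; offA is the row
+// offset of A inside its first 8-row panel (handled by the inner edge routine before
+// the panel-aligned main kernel), and km masks the store so only z[0..km-1] are
+// written.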
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_t_8_gen_lib8
+	.type kernel_sgemv_t_8_gen_lib8, @function
+kernel_sgemv_t_8_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_t_8_gen_lib8
+_kernel_sgemv_t_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_t_8_gen_lib8
+	.def kernel_sgemv_t_8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_8_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // x
+	movq	ARG3, %r14 // offA
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemv_add_t_8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11   // beta
+	movq	ARG8, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG9, %r10 // z 
+	movq	ARG10, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_t_8_gen_lib8, .-kernel_sgemv_t_8_gen_lib8
+#endif
+
+
+
+
+
+//                                 1      2         3                  4         5         6
+// void kernel_strsv_ln_inv_8_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z);
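+//
+// Reference semantics (editor's sketch in scalar C; it assumes that the 8x8
+// lower-triangular diagonal block L starts at column k of the same 8-row panel,
+// i.e. L(i,j) = A[8*(k+j) + i], as the pointer arithmetic below suggests):
+//
+//     float t[8];
+//     for (int ii = 0; ii < 8; ii++)
+//         {
+//         t[ii] = y[ii];
+//         for (int jj = 0; jj < k; jj++)
+//             t[ii] -= A[ii + 8*jj] * x[jj];      // t = y - A[0:8,0:k] * x
+//         }
+//     for (int ii = 0; ii < 8; ii++)              // forward substitution
+//         {
+//         float v = t[ii];
+//         for (int jj = 0; jj < ii; jj++)
+//             v -= A[8*(k+jj) + ii] * z[jj];      // L(ii,jj)
+//         z[ii] = v * inv_diag_A[ii];
+//         }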
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsv_ln_inv_8_lib8
+	.type kernel_strsv_ln_inv_8_lib8, @function
+kernel_strsv_ln_inv_8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsv_ln_inv_8_lib8
+_kernel_strsv_ln_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsv_ln_inv_8_lib8
+	.def kernel_strsv_ln_inv_8_lib8; .scl 2; .type 32; .endef
+kernel_strsv_ln_inv_8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+	movq	%r11, %r13 // A+k*bs*sizeof(float)
+
+
+	// call inner blender n
+
+	movq	ARG5, %r10   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_m11_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_m11_8_lib8
+#endif
+#endif
+
+
+	// solution
+
+	movq	%r13, %r10 // A+k*bs*sizeof(float)
+	movq	ARG3, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSV_LN_INV_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsv_ln_inv_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsv_ln_inv_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsv_ln_inv_8_lib8, .-kernel_strsv_ln_inv_8_lib8
+#endif
+
+
+
+
+
+//                                    1      2         3                  4         5         6         7       8
+// void kernel_strsv_ln_inv_8_vs_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsv_ln_inv_8_vs_lib8
+	.type kernel_strsv_ln_inv_8_vs_lib8, @function
+kernel_strsv_ln_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsv_ln_inv_8_vs_lib8
+_kernel_strsv_ln_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsv_ln_inv_8_vs_lib8
+	.def kernel_strsv_ln_inv_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsv_ln_inv_8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+	movq	%r11, %r13 // A+k*bs*sizeof(float)
+
+
+	// call inner blender n
+
+	movq	ARG5, %r10   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_m11_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_m11_8_lib8
+#endif
+#endif
+
+
+	// solution
+
+	movq	%r13, %r10 // A+k*bs*sizeof(float)
+	movq	ARG3, %r11 // inv_diag_A
+	movq	ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSV_LN_INV_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsv_ln_inv_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsv_ln_inv_8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // z 
+	movq	ARG7, %r11 // km
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsv_ln_inv_8_vs_lib8, .-kernel_strsv_ln_inv_8_vs_lib8
+#endif
+
+
+
+
+
+//                                 1      2         3        4                  5         6         7
+// void kernel_strsv_lt_inv_8_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
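+//
+// Reference semantics (editor's sketch in scalar C; here the 8x8 lower-triangular
+// block L is the leading block of A, L(i,j) = A[i + 8*j], and the transposed solve
+// L^T * z = y - A[8:k,0:8]^T * x[8:k] reduces to backward substitution):
+//
+//     float t[8];
+//     for (int ii = 0; ii < 8; ii++)
+//         {
+//         t[ii] = y[ii];
+//         for (int jj = 8; jj < k; jj++)                      // rows 8..k-1 of A
+//             t[ii] -= A[(jj/8)*8*sda + jj%8 + 8*ii] * x[jj]; // A(jj,ii)
+//         }
+//     for (int ii = 7; ii >= 0; ii--)                         // backward substitution
+//         {
+//         float v = t[ii];
+//         for (int jj = ii+1; jj < 8; jj++)
+//             v -= A[jj + 8*ii] * z[jj];                      // L(jj,ii) = L^T(ii,jj)
+//         z[ii] = v * inv_diag_A[ii];
+//         }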
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsv_lt_inv_8_lib8
+	.type kernel_strsv_lt_inv_8_lib8, @function
+kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsv_lt_inv_8_lib8
+_kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsv_lt_inv_8_lib8
+	.def kernel_strsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	subl	$8, %r10d
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	addq	%r12, %r11 // A+8*sda*sizeof(float)
+	movq	ARG5, %r13 // x
+	addq	$32, %r13 // x+8 
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG6, %r10 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+	// solution
+
+	movq	ARG2, %r10 // A
+	movq	ARG4, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsv_lt_inv_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsv_lt_inv_8_lib8, .-kernel_strsv_lt_inv_8_lib8
+#endif
+
+
+
+
+
+//                                    1      2         3        4                  5         6         7         8       9
+// void kernel_strsv_lt_inv_8_vs_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsv_lt_inv_8_vs_lib8
+	.type kernel_strsv_lt_inv_8_vs_lib8, @function
+kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsv_lt_inv_8_vs_lib8
+_kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsv_lt_inv_8_vs_lib8
+	.def kernel_strsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	subl	$8, %r10d
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	addq	%r12, %r11 // A+8*sda*sizeof(float)
+	movq	ARG5, %r13 // x
+	addq	$32, %r13 // x+8 
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG6, %r10 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+	// solution
+
+	movq	ARG2, %r10 // A
+	movq	ARG4, %r11 // inv_diag_A
+	movq	ARG8, %r12 // km
+	movq	ARG9, %r13 // kn
+	movq	ARG5, %r14 // x
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsv_lt_inv_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	ARG9, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsv_lt_inv_8_vs_lib8, .-kernel_strsv_lt_inv_8_vs_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.float	0.5
+	.float	1.5
+	.float	2.5
+	.float	3.5
+	.float	4.5
+	.float	5.5
+	.float	6.5
+	.float	7.5
+
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgesc_lib8.S b/kernel/avx/kernel_sgesc_lib8.S
new file mode 100644
index 0000000..43ff708
--- /dev/null
+++ b/kernel/avx/kernel_sgesc_lib8.S
@@ -0,0 +1,506 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGESC_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgesc_8_lib8, @function
+inner_kernel_sgesc_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgesc_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgesc_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgesc_8_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm15
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmulps		%ymm15, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r12)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmulps		%ymm15, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r12)
+
+	vmovaps		64(%r12), %ymm0
+	vmulps		%ymm15, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r12)
+	addq		$128, %r12
+
+	vmovaps		-32(%r12), %ymm0
+	vmulps		%ymm15, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r12)
+
+	cmpl		$4, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmulps		%ymm15, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r12)
+	subl		$1, %r10d
+	addq		$32, %r12
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgesc_8_lib8, .-inner_kernel_sgesc_8_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- m1
+
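+// Editor's note on the row mask computed below: m1 is broadcast as a float and
+// subtracted from the constants { 0.5, 1.5, ..., 7.5 } (.LC00); vmaskmovps then
+// stores lane ii only where the result is negative, i.e.
+//
+//     store_row[ii] = ((float)ii + 0.5f - (float)m1 < 0.0f);  // equivalent to ii < m1
+//
+// so rows m1..7 of each column are left untouched.
+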
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGESC_8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgesc_8_gen_lib8, @function
+inner_kernel_sgesc_8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgesc_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgesc_8_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgesc_8_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	vbroadcastss	0(%r11), %ymm14
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmulps		%ymm14, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15,  0(%r12)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmulps		%ymm14, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15,  32(%r12)
+
+	vmovaps		64(%r12), %ymm0
+	vmulps		%ymm14, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15,  64(%r12)
+	addq		$128, %r12
+
+	vmovaps		-32(%r12), %ymm0
+	vmulps		%ymm14, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15,  -32(%r12)
+
+	cmpl		$4, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmulps		%ymm14, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15,  0(%r12)
+	subl		$1, %r10d
+	addq		$32, %r12
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgesc_8_gen_lib8, .-inner_kernel_sgesc_8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                          rdi    rsi           rdx
+// void kernel_sgesc_8_lib8(int k, float *alpha, float *A);
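+//
+// Reference semantics (editor's sketch, assuming the lib8 panel-major layout): scale
+// k columns of one 8-row panel of A in place,
+//
+//     for (int jj = 0; jj < k; jj++)
+//         for (int ii = 0; ii < 8; ii++)
+//             A[ii + 8*jj] *= alpha[0];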
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgesc_8_lib8
+	.type kernel_sgesc_8_lib8, @function
+kernel_sgesc_8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgesc_8_lib8
+_kernel_sgesc_8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgesc_8_lib8
+	.def kernel_sgesc_8_lib8; .scl 2; .type 32; .endef
+kernel_sgesc_8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgesc kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGESC_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgesc_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgesc_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgesc_8_lib8, .-kernel_sgesc_8_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi           rdx       rcx
+// void kernel_sgesc_8_gen_lib8(int k, float *alpha, float *A, int m1);
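+//
+// Editor's note: same in-place scaling as kernel_sgesc_8_lib8 above, but only the
+// first m1 rows of each column are written (masked store, see the inner kernel).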
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgesc_8_gen_lib8
+	.type kernel_sgesc_8_gen_lib8, @function
+kernel_sgesc_8_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgesc_8_gen_lib8
+_kernel_sgesc_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgesc_8_gen_lib8
+	.def kernel_sgesc_8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgesc_8_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgesc gen kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGESC_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgesc_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgesc_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgesc_8_gen_lib8, .-kernel_sgesc_8_gen_lib8
+#endif
+
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgetr_lib8.S b/kernel/avx/kernel_sgetr_lib8.S
new file mode 100644
index 0000000..745c42e
--- /dev/null
+++ b/kernel/avx/kernel_sgetr_lib8.S
@@ -0,0 +1,2476 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+
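+// Editor's note: reference semantics (scalar C sketch, assuming the lib8
+// panel-major layout; B is a single 8-row panel, A spans k rows across panels):
+//
+//     for (int ii = 0; ii < k; ii++)                      // row of A = column of B
+//         for (int jj = 0; jj < 8; jj++)                  // column of A = row of B
+//             B[jj + 8*ii] = A[(ii/8)*8*sda + ii%8 + 8*jj];
+//
+// The vector code below transposes one 8x8 block per iteration with
+// unpcklps/unpckhps/shufps/vperm2f128.
+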
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGETR_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgetr_8_lib8, @function
+inner_kernel_sgetr_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgetr_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgetr_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgetr_8_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$7, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	subl		$8, %r10d
+	addq		%r12, %r11
+
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
+	vmovaps		%ymm2, 0(%r13)
+	vmovaps		%ymm3, 128(%r13)
+	vshufps		$0xee, %ymm10, %ymm8, %ymm0
+	vshufps		$0xee, %ymm14, %ymm12, %ymm1
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
+	vmovaps		%ymm2, 32(%r13)
+	vmovaps		%ymm3, 160(%r13)
+	vshufps		$0x44, %ymm11, %ymm9, %ymm0
+	vshufps		$0x44, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
+	vmovaps		%ymm2, 64(%r13)
+	vmovaps		%ymm3, 192(%r13)
+	vshufps		$0xee, %ymm11, %ymm9, %ymm0
+	vshufps		$0xee, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
+	vmovaps		%ymm2, 96(%r13)
+	vmovaps		%ymm3, 224(%r13)
+
+	addq		$256, %r13
+
+	cmpl		$7, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	// 0
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm8
+	vmovaps		%ymm8, 0(%r13)
+	cmpl	$1, %r10d
+	jle		3f
+	// 1
+	vperm2f128	$0x20, %ymm3, %ymm2, %ymm8
+	vmovaps		%ymm8, 32(%r13)
+	cmpl	$2, %r10d
+	jle		3f
+	// 2
+	vperm2f128	$0x20, %ymm5, %ymm4, %ymm8
+	vmovaps		%ymm8, 64(%r13)
+	cmpl	$3, %r10d
+	jle		3f
+	// 3
+	vperm2f128	$0x20, %ymm7, %ymm6, %ymm8
+	vmovaps		%ymm8, 96(%r13)
+	cmpl	$4, %r10d
+	jle		3f
+	// 4
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm8
+	vmovaps		%ymm8, 128(%r13)
+	cmpl	$5, %r10d
+	jle		3f
+	// 5
+	vperm2f128	$0x31, %ymm3, %ymm2, %ymm8
+	vmovaps		%ymm8, 160(%r13)
+	cmpl	$6, %r10d
+	jle		3f
+	// 6
+	vperm2f128	$0x31, %ymm5, %ymm4, %ymm8
+	vmovaps		%ymm8, 192(%r13)
+//	cmpl	$7, %r10d
+//	jle		3f
+	// 7
+//	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+//	vmovaps		%ymm8, 224(%r13)
+
+3:
+	movl	%r10d, %r14d
+	sall	$2, %r14d // kleft*sizeof(float)
+	addq	%r14, %r11 // A+kleft
+	movl	%r10d, %r14d
+	sall	$5, %r14d // kleft*bs*sizeof(float)
+	addq	%r14, %r13
+	movl	$0, %r10d
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgetr_8_lib8, .-inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
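+// Editor's note: same 8x8 block transpose as inner_kernel_sgetr_8_lib8 above, but
+// every store goes through vmaskmovps using the row mask that the matching
+// inner_edge_sgetr_8_*_gen_lib8 routine spills at -32(%rsp), so only the first m1
+// rows of each column of B are written.
+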
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgetr_8_gen_lib8, @function
+inner_kernel_sgetr_8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgetr_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgetr_8_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgetr_8_gen_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$7, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	subl		$8, %r10d
+	addq		%r12, %r11
+
+	vmovupd		-32(%rsp), %ymm4
+
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
+	vmaskmovps	%ymm2, %ymm4, 0(%r13)
+	vmaskmovps	%ymm3, %ymm4, 128(%r13)
+	vshufps		$0xee, %ymm10, %ymm8, %ymm0
+	vshufps		$0xee, %ymm14, %ymm12, %ymm1
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
+	vmaskmovps	%ymm2, %ymm4, 32(%r13)
+	vmaskmovps	%ymm3, %ymm4, 160(%r13)
+	vshufps		$0x44, %ymm11, %ymm9, %ymm0
+	vshufps		$0x44, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
+	vmaskmovps	%ymm2, %ymm4, 64(%r13)
+	vmaskmovps	%ymm3, %ymm4, 192(%r13)
+	vshufps		$0xee, %ymm11, %ymm9, %ymm0
+	vshufps		$0xee, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
+	vmaskmovps	%ymm2, %ymm4, 96(%r13)
+	vmaskmovps	%ymm3, %ymm4, 224(%r13)
+
+	addq		$256, %r13
+
+	cmpl		$7, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	vmovupd		-32(%rsp), %ymm9
+
+	// 0
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 0(%r13)
+	cmpl	$1, %r10d
+	jle		3f
+	// 1
+	vperm2f128	$0x20, %ymm3, %ymm2, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 32(%r13)
+	cmpl	$2, %r10d
+	jle		3f
+	// 2
+	vperm2f128	$0x20, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 64(%r13)
+	cmpl	$3, %r10d
+	jle		3f
+	// 3
+	vperm2f128	$0x20, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 96(%r13)
+	cmpl	$4, %r10d
+	jle		3f
+	// 4
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 128(%r13)
+	cmpl	$5, %r10d
+	jle		3f
+	// 5
+	vperm2f128	$0x31, %ymm3, %ymm2, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 160(%r13)
+	cmpl	$6, %r10d
+	jle		3f
+	// 6
+	vperm2f128	$0x31, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 192(%r13)
+//	cmpl	$7, %r10d
+//	jle		3f
+	// 7
+//	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+//	vmaskmovps	%ymm8, %ymm9, 224(%r13)
+
+3:
+	movl	%r10d, %r14d
+	sall	$2, %r14d // kleft*sizeof(float)
+	addq	%r14, %r11 // A+kleft
+	movl	%r10d, %r14d
+	sall	$5, %r14d // kleft*bs*sizeof(float)
+	addq	%r14, %r13
+	movl	$0, %r10d
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgetr_8_gen_lib8, .-inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_SGETR_8_0_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_sgetr_8_0_gen_lib8, @function
+inner_edge_sgetr_8_0_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_sgetr_8_0_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_0_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+	vmovupd		%ymm15, -32(%rsp) // spill mask to stack
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_sgetr_8_0_gen_lib8, .-inner_edge_sgetr_8_0_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+
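+// Editor's note: the inner_edge_sgetr_8_<off>_gen_lib8 routines (off = 0..7) handle a
+// source block that starts at row offset <off> inside its first 8-row panel of A:
+// they build and spill the row mask, transpose the remaining 8-off rows of that
+// partial panel into the first columns of B, and advance A and B so that the
+// panel-aligned main kernel can take over; the _8_0_ variant only builds the mask.
+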
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_SGETR_8_1_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_sgetr_8_1_gen_lib8, @function
+inner_edge_sgetr_8_1_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_sgetr_8_1_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_1_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+	vmovupd		%ymm15, -32(%rsp) // spill mask to stack
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	vmovupd		-32(%rsp), %ymm9
+
+	// 0
+	// 1
+	vperm2f128	$0x20, %ymm3, %ymm2, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 0(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 2
+	vperm2f128	$0x20, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 32(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 3
+	vperm2f128	$0x20, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 64(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 4
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 96(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 5
+	vperm2f128	$0x31, %ymm3, %ymm2, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 128(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 6
+	vperm2f128	$0x31, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 160(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 7
+	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 192(%r13)
+	subl	$1, %r10d
+
+	addq	%r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$224, %r13 // B+7*bs*sizeof(float)
+
+	jmp		2f
+
+3:
+	movl	%r10d, %r14d
+	sall	$2, %r14d
+	addq	%r14, %r11 // A+k*sizeof(float)
+	movl	%r10d, %r14d
+	sall	$5, %r14d
+	addq	%r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_sgetr_8_1_gen_lib8, .-inner_edge_sgetr_8_1_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_SGETR_8_2_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_sgetr_8_2_gen_lib8, @function
+inner_edge_sgetr_8_2_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_sgetr_8_2_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_2_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+	vmovupd		%ymm15, -32(%rsp) // spill mask to stack
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	vmovupd		-32(%rsp), %ymm9
+
+	// 0
+	// 1
+	// 2
+	vperm2f128	$0x20, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 0(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 3
+	vperm2f128	$0x20, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 32(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 4
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 64(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 5
+	vperm2f128	$0x31, %ymm3, %ymm2, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 96(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 6
+	vperm2f128	$0x31, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 128(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 7
+	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 160(%r13)
+	subl	$1, %r10d
+
+	addq	%r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$192, %r13 // B+6*bs*sizeof(float)
+
+	jmp		2f
+
+3:
+	movl	%r10d, %r14d
+	sall	$2, %r14d
+	addq	%r14, %r11 // A+k*sizeof(float)
+	movl	%r10d, %r14d
+	sall	$5, %r14d
+	addq	%r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_sgetr_8_2_gen_lib8, .-inner_edge_sgetr_8_2_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_SGETR_8_3_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_sgetr_8_3_gen_lib8, @function
+inner_edge_sgetr_8_3_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_sgetr_8_3_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_3_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+	vmovupd		%ymm15, -32(%rsp) // spill mask to stack
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	vmovupd		-32(%rsp), %ymm9
+
+	// 0
+	// 1
+	// 2
+	// 3
+	vperm2f128	$0x20, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 0(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 4
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 32(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 5
+	vperm2f128	$0x31, %ymm3, %ymm2, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 64(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 6
+	vperm2f128	$0x31, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 96(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 7
+	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 128(%r13)
+	subl	$1, %r10d
+
+	addq	%r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$160, %r13 // B+5*bs*sizeof(float)
+
+	jmp		2f
+
+3:
+	movl	%r10d, %r14d
+	sall	$2, %r14d
+	addq	%r14, %r11 // A+k*sizeof(float)
+	movl	%r10d, %r14d
+	sall	$5, %r14d
+	addq	%r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_sgetr_8_3_gen_lib8, .-inner_edge_sgetr_8_3_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_SGETR_8_4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_sgetr_8_4_gen_lib8, @function
+inner_edge_sgetr_8_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_sgetr_8_4_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+	vmovupd		%ymm15, -32(%rsp) // spill mask to stack
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	vmovupd		-32(%rsp), %ymm9
+
+	// 0
+	// 1
+	// 2
+	// 3
+	// 4
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 0(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 5
+	vperm2f128	$0x31, %ymm3, %ymm2, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 32(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 6
+	vperm2f128	$0x31, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 64(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 7
+	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 96(%r13)
+	subl	$1, %r10d
+
+	addq	%r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$128, %r13 // B+4*bs*sizeof(float)
+
+	jmp		2f
+
+3:
+	movl	%r10d, %r14d
+	sall	$2, %r14d
+	addq	%r14, %r11 // A+k*sizeof(float)
+	movl	%r10d, %r14d
+	sall	$5, %r14d
+	addq	%r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_sgetr_8_4_gen_lib8, .-inner_edge_sgetr_8_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_SGETR_8_5_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_sgetr_8_5_gen_lib8, @function
+inner_edge_sgetr_8_5_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_sgetr_8_5_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_5_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+	vmovupd		%ymm15, -32(%rsp) // spill mask to stack
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	vmovupd		-32(%rsp), %ymm9
+
+	// 0
+	// 1
+	// 2
+	// 3
+	// 4
+	// 5
+	vperm2f128	$0x31, %ymm3, %ymm2, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 0(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 6
+	vperm2f128	$0x31, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 32(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 7
+	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 64(%r13)
+	subl	$1, %r10d
+
+	addq	%r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$96, %r13 // B+3*bs*sizeof(float)
+
+	jmp		2f
+
+3:
+	movl	%r10d, %r14d
+	sall	$2, %r14d
+	addq	%r14, %r11 // A+k*sizeof(float)
+	movl	%r10d, %r14d
+	sall	$5, %r14d
+	addq	%r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_sgetr_8_5_gen_lib8, .-inner_edge_sgetr_8_5_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_SGETR_8_6_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_sgetr_8_6_gen_lib8, @function
+inner_edge_sgetr_8_6_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_sgetr_8_6_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_6_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+	vmovupd		%ymm15, -32(%rsp) // spill mask to stack
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	vmovupd		-32(%rsp), %ymm9
+
+	// 0
+	// 1
+	// 2
+	// 3
+	// 4
+	// 5
+	// 6
+	vperm2f128	$0x31, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 0(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 7
+	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 32(%r13)
+	subl	$1, %r10d
+
+	addq	%r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$64, %r13 // B+2*bs*sizeof(float)
+
+	jmp		2f
+
+3:
+	movl	%r10d, %r14d
+	sall	$2, %r14d
+	addq	%r14, %r11 // A+k*sizeof(float)
+	movl	%r10d, %r14d
+	sall	$5, %r14d
+	addq	%r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_sgetr_8_6_gen_lib8, .-inner_edge_sgetr_8_6_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_SGETR_8_7_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_sgetr_8_7_gen_lib8, @function
+inner_edge_sgetr_8_7_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_sgetr_8_7_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_7_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+	vmovupd		%ymm15, -32(%rsp) // spill mask to stack
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	vmovupd		-32(%rsp), %ymm9
+
+	// 0
+	// 1
+	// 2
+	// 3
+	// 4
+	// 5
+	// 6
+	// 7
+	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 0(%r13)
+	subl	$1, %r10d
+
+	addq	%r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$32, %r13 // B+1*bs*sizeof(float)
+
+//	jmp		2f
+//
+//3:
+//	movl	%r10d, %r14d
+//	sall	$2, %r14d
+//	addq	%r14, %r11 // A+k*sizeof(float)
+//	movl	%r10d, %r14d
+//	sall	$5, %r14d
+//	addq	%r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_sgetr_8_7_gen_lib8, .-inner_edge_sgetr_8_7_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                            rdi    rsi       rdx      rcx
+// void kernel_sgetr_8_0_lib8(int k, float *A, int sda, float *B);
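+// note: this family of kernels transposes a k x 8 block of A (panel-major
+// storage, panel stride sda) into one 8-row panel of B; the _N suffix is the
+// row offset of A inside its first panel, so with offset 0 no edge pass is needed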
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_0_lib8
+	.type kernel_sgetr_8_0_lib8, @function
+kernel_sgetr_8_0_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_0_lib8
+_kernel_sgetr_8_0_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_0_lib8
+	.def kernel_sgetr_8_0_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_0_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+
+	// offsetA==0: no edge
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_0_lib8, .-kernel_sgetr_8_0_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi       rdx      rcx       r8
+// void kernel_sgetr_8_0_gen_lib8(int k, float *A, int sda, float *B, int m1);
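+// note: the _gen variants take an extra argument m1, the number of rows of B
+// that may be written; the edge builds a sign-bit mask from m1 for vmaskmovps,
+// and the gen inner kernel presumably applies the same mask, so rows >= m1 of
+// B are left untouched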
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_0_gen_lib8
+	.type kernel_sgetr_8_0_gen_lib8, @function
+kernel_sgetr_8_0_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_0_gen_lib8
+_kernel_sgetr_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_0_gen_lib8
+	.def kernel_sgetr_8_0_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_0_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14  // m1
+
+	// offsetA==0: edge to compute mask
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_0_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_0_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_0_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_0_gen_lib8, .-kernel_sgetr_8_0_gen_lib8
+#endif
+
+
+
+
+
+//                            rdi    rsi       rdx      rcx
+// void kernel_sgetr_8_1_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_1_lib8
+	.type kernel_sgetr_8_1_lib8, @function
+kernel_sgetr_8_1_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_1_lib8
+_kernel_sgetr_8_1_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_1_lib8
+	.def kernel_sgetr_8_1_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_1_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	$8, %r14  // m1
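+	// note: the plain (non-gen) kernels pass m1 = 8, so the shared edge
+	// routine's mask selects all 8 lanes and every row of B is stored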
+
+	// offsetA==1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_1_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_1_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_1_lib8, .-kernel_sgetr_8_1_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi       rdx      rcx       r8
+// void kernel_sgetr_8_1_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_1_gen_lib8
+	.type kernel_sgetr_8_1_gen_lib8, @function
+kernel_sgetr_8_1_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_1_gen_lib8
+_kernel_sgetr_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_1_gen_lib8
+	.def kernel_sgetr_8_1_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_1_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14  // m1
+
+	// offsetA==1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_1_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_1_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_1_gen_lib8, .-kernel_sgetr_8_1_gen_lib8
+#endif
+
+
+
+
+
+//                            rdi    rsi       rdx      rcx
+// void kernel_sgetr_8_2_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_2_lib8
+	.type kernel_sgetr_8_2_lib8, @function
+kernel_sgetr_8_2_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_2_lib8
+_kernel_sgetr_8_2_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_2_lib8
+	.def kernel_sgetr_8_2_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_2_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	$8, %r14  // m1
+
+	// offsetA==2
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_2_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_2_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_2_lib8, .-kernel_sgetr_8_2_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi       rdx      rcx       r8
+// void kernel_sgetr_8_2_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_2_gen_lib8
+	.type kernel_sgetr_8_2_gen_lib8, @function
+kernel_sgetr_8_2_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_2_gen_lib8
+_kernel_sgetr_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_2_gen_lib8
+	.def kernel_sgetr_8_2_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_2_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14  // m1
+
+	// offsetA==2
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_2_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_2_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_2_gen_lib8, .-kernel_sgetr_8_2_gen_lib8
+#endif
+
+
+
+
+
+//                            rdi    rsi       rdx      rcx
+// void kernel_sgetr_8_3_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_3_lib8
+	.type kernel_sgetr_8_3_lib8, @function
+kernel_sgetr_8_3_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_3_lib8
+_kernel_sgetr_8_3_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_3_lib8
+	.def kernel_sgetr_8_3_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_3_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	$8, %r14  // m1
+
+	// offsetA==3
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_3_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_3_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_3_lib8, .-kernel_sgetr_8_3_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi       rdx      rcx       r8
+// void kernel_sgetr_8_3_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_3_gen_lib8
+	.type kernel_sgetr_8_3_gen_lib8, @function
+kernel_sgetr_8_3_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_3_gen_lib8
+_kernel_sgetr_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_3_gen_lib8
+	.def kernel_sgetr_8_3_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_3_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14  // m1
+
+	// offsetA==3
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_3_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_3_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_3_gen_lib8, .-kernel_sgetr_8_3_gen_lib8
+#endif
+
+
+
+
+
+//                            rdi    rsi       rdx      rcx
+// void kernel_sgetr_8_4_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_4_lib8
+	.type kernel_sgetr_8_4_lib8, @function
+kernel_sgetr_8_4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_4_lib8
+_kernel_sgetr_8_4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_4_lib8
+	.def kernel_sgetr_8_4_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	$8, %r14  // m1
+
+	// offsetA==4
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_4_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_4_lib8, .-kernel_sgetr_8_4_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi       rdx      rcx       r8
+// void kernel_sgetr_8_4_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_4_gen_lib8
+	.type kernel_sgetr_8_4_gen_lib8, @function
+kernel_sgetr_8_4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_4_gen_lib8
+_kernel_sgetr_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_4_gen_lib8
+	.def kernel_sgetr_8_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14  // m1
+
+	// offsetA==4
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_4_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_4_gen_lib8, .-kernel_sgetr_8_4_gen_lib8
+#endif
+
+
+
+
+
+//                            rdi    rsi       rdx      rcx
+// void kernel_sgetr_8_5_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_5_lib8
+	.type kernel_sgetr_8_5_lib8, @function
+kernel_sgetr_8_5_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_5_lib8
+_kernel_sgetr_8_5_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_5_lib8
+	.def kernel_sgetr_8_5_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_5_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	$8, %r14  // m1
+
+	// offsetA==5
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_5_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_5_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_5_lib8, .-kernel_sgetr_8_5_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi       rdx      rcx       r8
+// void kernel_sgetr_8_5_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_5_gen_lib8
+	.type kernel_sgetr_8_5_gen_lib8, @function
+kernel_sgetr_8_5_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_5_gen_lib8
+_kernel_sgetr_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_5_gen_lib8
+	.def kernel_sgetr_8_5_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_5_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14  // m1
+
+	// offsetA==5
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_5_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_5_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_5_gen_lib8, .-kernel_sgetr_8_5_gen_lib8
+#endif
+
+
+
+
+
+//                            rdi    rsi       rdx      rcx
+// void kernel_sgetr_8_6_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_6_lib8
+	.type kernel_sgetr_8_6_lib8, @function
+kernel_sgetr_8_6_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_6_lib8
+_kernel_sgetr_8_6_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_6_lib8
+	.def kernel_sgetr_8_6_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_6_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	$8, %r14  // m1
+
+	// offsetA==6
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_6_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_6_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_6_lib8, .-kernel_sgetr_8_6_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi       rdx      rcx       r8
+// void kernel_sgetr_8_6_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_6_gen_lib8
+	.type kernel_sgetr_8_6_gen_lib8, @function
+kernel_sgetr_8_6_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_6_gen_lib8
+_kernel_sgetr_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_6_gen_lib8
+	.def kernel_sgetr_8_6_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_6_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14  // m1
+
+	// offsetA==6
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_6_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_6_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_6_gen_lib8, .-kernel_sgetr_8_6_gen_lib8
+#endif
+
+
+
+
+
+//                            rdi    rsi       rdx      rcx
+// void kernel_sgetr_8_7_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_7_lib8
+	.type kernel_sgetr_8_7_lib8, @function
+kernel_sgetr_8_7_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_7_lib8
+_kernel_sgetr_8_7_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_7_lib8
+	.def kernel_sgetr_8_7_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_7_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	$8, %r14  // m1
+
+	// offsetA==7
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_7_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_7_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_7_lib8, .-kernel_sgetr_8_7_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi       rdx      rcx       r8
+// void kernel_sgetr_8_7_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_7_gen_lib8
+	.type kernel_sgetr_8_7_gen_lib8, @function
+kernel_sgetr_8_7_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_7_gen_lib8
+_kernel_sgetr_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_7_gen_lib8
+	.def kernel_sgetr_8_7_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_7_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14  // m1
+
+	// offsetA==7
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_7_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_7_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_7_gen_lib8, .-kernel_sgetr_8_7_gen_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
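+	// note: the eight .long values above are the IEEE-754 bit patterns of
+	// 0.5, 1.5, ..., 7.5; the edge routines subtract the row count m1 from
+	// this vector to build the sign-bit mask consumed by vmaskmovps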
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+