Squashed 'third_party/blasfeo/' content from commit 2a828ca

Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bd23910
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+*.swp
+*.s
+*.o
+*.out
+include/blasfeo_target.h
+libblasfeo.a
+libblasfeo.so
+octave-workspace
+build/
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..b7cfbf5
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,611 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of HPIPM.                                                                     #
+#                                                                                                 #
+# HPIPM -- High Performance Interior Point Method.                                                #
+# Copyright (C) 2017 by Gianluca Frison.                                                          #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+cmake_minimum_required(VERSION 2.8.11)
+
+project(blasfeo)
+
+enable_language(C ASM)
+
+# Target architecture
+#set(TARGET X64_INTEL_HASWELL)
+set(TARGET X64_INTEL_SANDY_BRIDGE CACHE STRING "Target architecture")
+#set(TARGET X64_INTEL_CORE)
+#set(TARGET X64_AMD_BULLDOZER)
+#set(TARGET ARMV8A_ARM_CORTEX_A57)
+#set(TARGET ARMV7A_ARM_CORTEX_A15)
+#set(TARGET GENERIC)
+
+# Linear Algebra library
+set(LA HIGH_PERFORMANCE CACHE STRING "Linear algebra optimization level")
+#set(LA REFERENCE)
+#set(LA BLAS)
+
+# BLAS and LAPACK version (for LA=BLAS in BLASFEO)
+set(REF_BLAS 0 CACHE STRING "Reference blas to use")
+#set(REF_BLAS OPENBLAS)
+#set(REF_BLAS NETLIB)
+#set(REF_BLAS MKL)
+#set(REF_BLAS BLIS)
+#set(REF_BLAS ATLAS)
+
+# Compile auxiliary functions with external dependencies (for memory allocation and printing)
+set(EXT_DEP ON CACHE BOOL "Compile external dependencies in BLASFEO")
+
+configure_file(${PROJECT_SOURCE_DIR}/blasfeo_target.h.in
+	${CMAKE_CURRENT_SOURCE_DIR}/include/blasfeo_target.h @ONLY)
+
+# C Compiler
+# set(CC_COMPILER gcc CACHE STRING "compiler")
+#set(CC_COMPILER clang)
+#set(CC_COMPILER x86_64-w64-mingw32-gcc)
+
+# build shared library
+#set(BUILD_SHARED_LIBS ON CACHE STRING "Build shared libraries")
+
+# installation directory
+if(CMAKE_INSTALL_PREFIX MATCHES "/usr/local")
+	set(CMAKE_INSTALL_PREFIX "/opt/blasfeo")
+endif()
+
+# headers installation directory
+set(BLASFEO_HEADERS_INSTALLATION_DIRECTORY "include" CACHE STRING "Headers local installation directory")
+
+# Macro level (code size vs performance in assembly kernels): 0 (no macro), 1 (all macro but gemm kernel), 2 (all macro)
+set(MACRO_LEVEL 0)
+
+# enable runtime checks
+set(RUNTIME_CHECKS 0)
+#set(RUNTIME_CHECKS 1)
+
+# compiler flags
+if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+	set(CMAKE_C_FLAGS "")
+	set(CMAKE_ASM_FLAGS "")
+	set(CMAKE_C_FLAGS_RELEASE "")
+	set(CMAKE_ASM_FLAGS_RELEASE "")
+	# optimization flags
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2")
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
+	# debugging flags
+	#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g")
+	#set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -g")
+endif()
+
+# search directories
+#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -I${BLASFEO_PATH}/include") XXX
+
+#
+if(${LA} MATCHES HIGH_PERFORMANCE)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DLA_HIGH_PERFORMANCE")
+endif()
+if(${LA} MATCHES REFERENCE)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DLA_REFERENCE")
+endif()
+if(${LA} MATCHES BLAS)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DLA_BLAS")
+endif()
+
+#
+if(${RUNTIME_CHECKS} MATCHES 1)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DDIM_CHECK")
+endif()
+
+#
+if(${EXT_DEP})
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DEXT_DEP")
+endif()
+
+#
+if(${MACRO_LEVEL} MATCHES 1)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DMACRO_LEVEL=1")
+endif()
+if(${MACRO_LEVEL} MATCHES 2)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DMACRO_LEVEL=2")
+endif()
+
+#
+if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DOS_LINUX")
+	set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -DOS_LINUX")
+endif()
+if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DOS_MAC")
+	set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -DOS_MAC")
+endif()
+if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DOS_WINDOWS")
+	set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -DOS_WINDOWS")
+endif()
+
+#
+if(${REF_BLAS} MATCHES 0)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ")
+endif(${REF_BLAS} MATCHES 0)
+if(${REF_BLAS} MATCHES OPENBLAS)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DREF_BLAS_OPENBLAS -I/opt/openblas/include")
+endif(${REF_BLAS} MATCHES OPENBLAS)
+if(${REF_BLAS} MATCHES BLIS)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DREF_BLAS_BLIS -std=c99")
+endif(${REF_BLAS} MATCHES BLIS)
+if(${REF_BLAS} MATCHES NETLIB)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DREF_BLAS_NETLIB")
+endif(${REF_BLAS} MATCHES NETLIB)
+if(${REF_BLAS} MATCHES MKL)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DREF_BLAS_MKL -m64 -I/opt/intel/mkl/include")
+endif(${REF_BLAS} MATCHES MKL)
+if(${REF_BLAS} MATCHES ATLAS)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DREF_BLAS_ATLAS")
+endif(${REF_BLAS} MATCHES ATLAS)
+
+# architecture-specific flags
+if(${TARGET} MATCHES X64_INTEL_HASWELL)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_X64_INTEL_HASWELL")
+	if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+		set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m64 -mavx2 -mfma")
+	endif()
+endif()
+
+if(${TARGET} MATCHES X64_INTEL_SANDY_BRIDGE)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_X64_INTEL_SANDY_BRIDGE")
+	if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+		set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m64 -mavx")
+	endif()
+endif()
+
+if(${TARGET} MATCHES X64_INTEL_CORE)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_X64_INTEL_CORE")
+	if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+		set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m64 -msse3")
+	endif()
+endif()
+
+if(${TARGET} MATCHES X64_AMD_BULLDOZER)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_X64_AMD_BULLDOZER")
+	if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+		set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m64 -mavx -mfma")
+	endif()
+endif()
+
+if(${TARGET} MATCHES ARMV8A_ARM_CORTEX_A57)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_ARMV8A_ARM_CORTEX_A57")
+	set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -DTARGET_ARMV8A_ARM_CORTEX_A57")
+	if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+		set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto+fp+simd")
+	endif()
+endif()
+
+if(${TARGET} MATCHES ARMV7A_ARM_CORTEX_A15)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_ARMV7A_ARM_CORTEX_A15")
+	set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -DTARGET_ARMV7A_ARM_CORTEX_A15")
+	if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+		set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -marm -mfloat-abi=hard -mfpu=neon-vfpv4 -mcpu=cortex-a15")
+		set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon-vfpv4")
+	endif()
+endif()
+
+if(${TARGET} MATCHES GENERIC)
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTARGET_GENERIC")
+endif()
+
+
+
+# source files
+
+if(${LA} MATCHES HIGH_PERFORMANCE)
+
+	if(${TARGET} MATCHES X64_INTEL_HASWELL)
+
+		file(GLOB AUX_SRC
+			${PROJECT_SOURCE_DIR}/auxiliary/d_aux_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/avx/kernel_dgecp_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/avx2/kernel_dgetr_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/s_aux_lib8.c
+			${PROJECT_SOURCE_DIR}/auxiliary/m_aux_lib48.c
+			)
+
+		file(GLOB KERNEL_SRC
+			${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dgemm_12x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dgemm_8x8_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dgemm_8x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dgemm_4x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgemm_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dgemv_8_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgemv_4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dsymv_6_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dsymv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dgetrf_pivot_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgeqrf_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dgebp_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_dgelqf_4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_sgemm_24x4_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_sgemm_16x4_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_sgemm_8x8_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx2/kernel_sgemm_8x4_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemm_diag_lib8.c
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgead_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgecp_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgetr_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgesc_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemv_8_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemv_4_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_ssymv_4_lib8.c
+			)
+
+		file(GLOB BLAS_SRC
+			${PROJECT_SOURCE_DIR}/blas/d_blas1_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas2_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas2_diag_lib.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas3_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas3_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_lapack_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas1_lib8.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas2_lib8.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas2_diag_lib.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas3_lib8.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas3_diag_lib8.c
+			${PROJECT_SOURCE_DIR}/blas/s_lapack_lib8.c
+			)
+
+	endif(${TARGET} MATCHES X64_INTEL_HASWELL)
+
+	if(${TARGET} MATCHES X64_INTEL_SANDY_BRIDGE)
+
+		file(GLOB AUX_SRC
+			${PROJECT_SOURCE_DIR}/auxiliary/d_aux_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/avx/kernel_dgecp_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/avx/kernel_dgetr_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/s_aux_lib8.c
+			${PROJECT_SOURCE_DIR}/auxiliary/m_aux_lib48.c
+			)
+
+		file(GLOB KERNEL_SRC
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgemm_8x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgemm_4x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgemm_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgemv_12_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgemv_8_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgemv_4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dsymv_6_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dsymv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgetrf_pivot_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgeqrf_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgebp_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_dgelqf_4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemm_16x4_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemm_8x8_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemm_8x4_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemm_diag_lib8.c
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgead_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgecp_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgetr_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgesc_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemv_8_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/avx/kernel_sgemv_4_lib8.S
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_ssymv_4_lib8.c
+			)
+
+		file(GLOB BLAS_SRC
+			${PROJECT_SOURCE_DIR}/blas/d_blas1_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas2_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas2_diag_lib.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas3_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas3_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_lapack_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas1_lib8.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas2_lib8.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas2_diag_lib.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas3_lib8.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas3_diag_lib8.c
+			${PROJECT_SOURCE_DIR}/blas/s_lapack_lib8.c
+			)
+
+	endif(${TARGET} MATCHES X64_INTEL_SANDY_BRIDGE)
+
+	if(${TARGET} MATCHES X64_INTEL_CORE)
+	
+		file(GLOB AUX_SRC
+			${PROJECT_SOURCE_DIR}/auxiliary/d_aux_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgecp_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgetr_lib4.c 
+			${PROJECT_SOURCE_DIR}/auxiliary/s_aux_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_sgetr_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/m_aux_lib44.c
+			)
+
+		file(GLOB KERNEL_SRC
+			${PROJECT_SOURCE_DIR}/kernel/sse3/kernel_dgemm_4x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_4x4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dsymv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgeqrf_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_4x4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_ssymv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgecp_lib4.c
+			)
+
+		file(GLOB BLAS_SRC
+			${PROJECT_SOURCE_DIR}/blas/d_blas1_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas2_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas2_diag_lib.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas3_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas3_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_lapack_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas1_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas2_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas2_diag_lib.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas3_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas3_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_lapack_lib4.c
+			)
+
+	endif(${TARGET} MATCHES X64_INTEL_CORE)
+
+	if(${TARGET} MATCHES X64_AMD_BULLDOZER)
+	
+		file(GLOB AUX_SRC
+			${PROJECT_SOURCE_DIR}/auxiliary/d_aux_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgecp_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgetr_lib4.c 
+			${PROJECT_SOURCE_DIR}/auxiliary/s_aux_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_sgetr_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/m_aux_lib44.c
+			)
+
+		file(GLOB KERNEL_SRC
+			${PROJECT_SOURCE_DIR}/kernel/fma/kernel_dgemm_4x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_4x4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dsymv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgeqrf_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_4x4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_ssymv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgecp_lib4.c
+			)
+
+		file(GLOB BLAS_SRC
+			${PROJECT_SOURCE_DIR}/blas/d_blas1_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas2_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas2_diag_lib.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas3_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas3_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_lapack_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas1_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas2_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas2_diag_lib.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas3_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas3_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_lapack_lib4.c
+			)
+
+	endif(${TARGET} MATCHES X64_AMD_BULLDOZER)
+
+	if(${TARGET} MATCHES ARMV8A_ARM_CORTEX_A57)
+	
+		file(GLOB AUX_SRC
+			${PROJECT_SOURCE_DIR}/auxiliary/d_aux_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgecp_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgetr_lib4.c 
+			${PROJECT_SOURCE_DIR}/auxiliary/s_aux_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_sgetr_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/m_aux_lib44.c
+			)
+
+		file(GLOB KERNEL_SRC
+			${PROJECT_SOURCE_DIR}/kernel/armv8a/kernel_dgemm_8x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/armv8a/kernel_dgemm_4x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_4x4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dsymv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgeqrf_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/armv8a/kernel_sgemm_16x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/armv8a/kernel_sgemm_12x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/armv8a/kernel_sgemm_8x8_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/armv8a/kernel_sgemm_8x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/armv8a/kernel_sgemm_4x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_4x4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_ssymv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgecp_lib4.c
+			)
+
+		file(GLOB BLAS_SRC
+			${PROJECT_SOURCE_DIR}/blas/d_blas1_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas2_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas2_diag_lib.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas3_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas3_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_lapack_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas1_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas2_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas2_diag_lib.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas3_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas3_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_lapack_lib4.c
+			)
+
+	endif(${TARGET} MATCHES ARMV8A_ARM_CORTEX_A57)
+
+	if(${TARGET} MATCHES ARMV7A_ARM_CORTEX_A15)
+	
+		file(GLOB AUX_SRC
+			${PROJECT_SOURCE_DIR}/auxiliary/d_aux_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgecp_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgetr_lib4.c 
+			${PROJECT_SOURCE_DIR}/auxiliary/s_aux_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_sgetr_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/m_aux_lib44.c
+			)
+
+		file(GLOB KERNEL_SRC
+			${PROJECT_SOURCE_DIR}/kernel/armv7a/kernel_dgemm_4x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_4x4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dsymv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgeqrf_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/armv7a/kernel_sgemm_12x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/armv7a/kernel_sgemm_8x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/armv7a/kernel_sgemm_4x4_lib4.S
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_4x4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_ssymv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgecp_lib4.c
+			)
+
+		file(GLOB BLAS_SRC
+			${PROJECT_SOURCE_DIR}/blas/d_blas1_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas2_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas2_diag_lib.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas3_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas3_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_lapack_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas1_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas2_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas2_diag_lib.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas3_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas3_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_lapack_lib4.c
+			)
+
+	endif(${TARGET} MATCHES ARMV7A_ARM_CORTEX_A15)
+
+	if(${TARGET} MATCHES GENERIC)
+	
+		file(GLOB AUX_SRC
+			${PROJECT_SOURCE_DIR}/auxiliary/d_aux_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgecp_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_dgetr_lib4.c 
+			${PROJECT_SOURCE_DIR}/auxiliary/s_aux_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/c99/kernel_sgetr_lib4.c
+			${PROJECT_SOURCE_DIR}/auxiliary/m_aux_lib44.c
+			)
+
+		file(GLOB KERNEL_SRC
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_4x4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemm_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgemv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dsymv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_dgeqrf_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_4x4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemm_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgemv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_ssymv_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
+			${PROJECT_SOURCE_DIR}/kernel/c99/kernel_sgecp_lib4.c
+			)
+
+		file(GLOB BLAS_SRC
+			${PROJECT_SOURCE_DIR}/blas/d_blas1_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas2_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas2_diag_lib.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas3_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_blas3_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/d_lapack_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas1_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas2_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas2_diag_lib.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas3_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_blas3_diag_lib4.c
+			${PROJECT_SOURCE_DIR}/blas/s_lapack_lib4.c
+			)
+
+	endif(${TARGET} MATCHES GENERIC)
+
+else(${LA} MATCHES HIGH_PERFORMANCE) # REFERENCE BLAS
+
+	file(GLOB AUX_SRC
+		${PROJECT_SOURCE_DIR}/auxiliary/d_aux_lib.c
+		${PROJECT_SOURCE_DIR}/auxiliary/s_aux_lib.c
+		${PROJECT_SOURCE_DIR}/auxiliary/m_aux_lib.c
+		)
+
+	file(GLOB BLAS_SRC
+		${PROJECT_SOURCE_DIR}/blas/d_blas1_lib.c
+		${PROJECT_SOURCE_DIR}/blas/d_blas2_lib.c
+		${PROJECT_SOURCE_DIR}/blas/d_blas2_diag_lib.c
+		${PROJECT_SOURCE_DIR}/blas/d_blas3_lib.c
+		${PROJECT_SOURCE_DIR}/blas/d_blas3_diag_lib.c
+		${PROJECT_SOURCE_DIR}/blas/d_lapack_lib.c
+		${PROJECT_SOURCE_DIR}/blas/s_blas1_lib.c
+		${PROJECT_SOURCE_DIR}/blas/s_blas2_lib.c
+		${PROJECT_SOURCE_DIR}/blas/s_blas2_diag_lib.c
+		${PROJECT_SOURCE_DIR}/blas/s_blas3_lib.c
+		${PROJECT_SOURCE_DIR}/blas/s_blas3_diag_lib.c
+		${PROJECT_SOURCE_DIR}/blas/s_lapack_lib.c
+		)
+
+endif(${LA} MATCHES HIGH_PERFORMANCE)
+
+if(${EXT_DEP})
+
+	file(GLOB EXT_SRC
+		${PROJECT_SOURCE_DIR}/auxiliary/d_aux_ext_dep_lib.c
+		${PROJECT_SOURCE_DIR}/auxiliary/s_aux_ext_dep_lib.c
+		${PROJECT_SOURCE_DIR}/auxiliary/v_aux_ext_dep_lib.c
+		${PROJECT_SOURCE_DIR}/auxiliary/i_aux_ext_dep_lib.c
+		)
+
+endif()
+
+set(BLASFEO_SRC ${AUX_SRC} ${KERNEL_SRC} ${BLAS_SRC} ${EXT_SRC})
+
+# add library
+add_library(blasfeo ${BLASFEO_SRC})
+target_include_directories(blasfeo
+	PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
+
+install(TARGETS blasfeo EXPORT blasfeoConfig
+	LIBRARY DESTINATION lib
+	ARCHIVE DESTINATION lib
+	RUNTIME DESTINATION bin)
+
+install(EXPORT blasfeoConfig DESTINATION cmake)
+
+file(GLOB_RECURSE BLASFEO_HEADERS "include/*.h")
+install(FILES ${BLASFEO_HEADERS} DESTINATION ${BLASFEO_HEADERS_INSTALLATION_DIRECTORY})
+
+# test problems
+# add_subdirectory(test_problems)
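Usage sketch (not part of the vendored tree): the CMakeLists.txt above installs the library together with a blasfeoConfig export (install(EXPORT blasfeoConfig DESTINATION cmake)), so a downstream CMake project could link against the installed copy roughly as below. This is a minimal sketch under stated assumptions: the project name "consumer", the target "solver" and the source file main.c are hypothetical, and the default install prefix /opt/blasfeo set above is assumed. Because the exported target only carries a build-interface include path, the installed header directory is added explicitly.

    # Hypothetical downstream CMakeLists.txt (sketch only; assumes BLASFEO was
    # configured and installed with the cache variables above, e.g.
    #   cmake -DTARGET=X64_INTEL_SANDY_BRIDGE -DLA=HIGH_PERFORMANCE <src-dir> && make install)
    cmake_minimum_required(VERSION 2.8.11)
    project(consumer C)

    # let find_package() locate /opt/blasfeo/cmake/blasfeoConfig.cmake
    list(APPEND CMAKE_PREFIX_PATH "/opt/blasfeo")
    find_package(blasfeo REQUIRED)

    add_executable(solver main.c)
    # the export defines an imported target named "blasfeo"
    target_link_libraries(solver blasfeo)
    # the export carries no install-interface include path, so add the installed
    # headers explicitly (see BLASFEO_HEADERS_INSTALLATION_DIRECTORY above)
    target_include_directories(solver PRIVATE "/opt/blasfeo/include")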
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..5ab7695
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,504 @@
+		  GNU LESSER GENERAL PUBLIC LICENSE
+		       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+		  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+  
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+			    NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.  It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
+
+
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..b7a438f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,257 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ./Makefile.rule
+
+OBJS = 
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+# aux
+OBJS += ./auxiliary/d_aux_lib4.o ./auxiliary/avx/kernel_dgecp_lib4.o ./auxiliary/avx2/kernel_dgetr_lib4.o
+OBJS += ./auxiliary/s_aux_lib8.o
+OBJS += ./auxiliary/m_aux_lib48.o
+# kernels
+OBJS += ./kernel/avx2/kernel_dgemm_12x4_lib4.o ./kernel/avx2/kernel_dgemm_8x8_lib4.o ./kernel/avx2/kernel_dgemm_8x4_lib4.o ./kernel/avx2/kernel_dgemm_4x4_lib4.o ./kernel/avx/kernel_dgemm_diag_lib4.o ./kernel/avx2/kernel_dgemv_8_lib4.o ./kernel/avx/kernel_dgemv_4_lib4.o ./kernel/avx2/kernel_dsymv_6_lib4.o ./kernel/avx2/kernel_dgetrf_pivot_4_lib4.o ./kernel/avx/kernel_dgeqrf_4_lib4.o kernel/avx2/kernel_dgebp_lib4.o kernel/avx2/kernel_dgelqf_4_lib4.o
+OBJS += ./kernel/avx2/kernel_sgemm_24x4_lib8.o ./kernel/avx2/kernel_sgemm_16x4_lib8.o ./kernel/avx2/kernel_sgemm_8x8_lib8.o ./kernel/avx2/kernel_sgemm_8x4_lib8.o ./kernel/avx/kernel_sgemm_diag_lib8.o ./kernel/avx/kernel_sgecp_lib8.o ./kernel/avx/kernel_sgetr_lib8.o ./kernel/avx/kernel_sgead_lib8.o ./kernel/avx/kernel_sgesc_lib8.o ./kernel/avx/kernel_sgemv_8_lib8.o ./kernel/avx/kernel_sgemv_4_lib8.o
+# blas
+OBJS += ./blas/d_blas1_lib4.o ./blas/d_blas2_lib4.o ./blas/d_blas2_diag_lib.o ./blas/d_blas3_lib4.o ./blas/d_blas3_diag_lib4.o ./blas/d_lapack_lib4.o
+OBJS += ./blas/s_blas1_lib8.o ./blas/s_blas2_lib8.o ./blas/s_blas2_diag_lib.o ./blas/s_blas3_lib8.o ./blas/s_blas3_diag_lib8.o ./blas/s_lapack_lib8.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+# aux
+OBJS += ./auxiliary/d_aux_lib4.o ./auxiliary/avx/kernel_dgecp_lib4.o ./auxiliary/avx/kernel_dgetr_lib4.o 
+OBJS += ./auxiliary/s_aux_lib8.o
+OBJS += ./auxiliary/m_aux_lib48.o
+# kernels
+OBJS += ./kernel/avx/kernel_dgemm_8x4_lib4.o ./kernel/avx/kernel_dgemm_4x4_lib4.o ./kernel/avx/kernel_dgemm_diag_lib4.o ./kernel/avx/kernel_dgemv_12_lib4.o ./kernel/avx/kernel_dgemv_8_lib4.o ./kernel/avx/kernel_dgemv_4_lib4.o ./kernel/avx/kernel_dsymv_6_lib4.o ./kernel/avx/kernel_dgetrf_pivot_4_lib4.o ./kernel/avx/kernel_dgeqrf_4_lib4.o kernel/avx/kernel_dgebp_lib4.o
+OBJS += ./kernel/avx/kernel_sgemm_16x4_lib8.o ./kernel/avx/kernel_sgemm_8x8_lib8.o ./kernel/avx/kernel_sgemm_8x4_lib8.o ./kernel/avx/kernel_sgecp_lib8.o ./kernel/avx/kernel_sgemm_diag_lib8.o ./kernel/avx/kernel_sgetr_lib8.o ./kernel/avx/kernel_sgead_lib8.o ./kernel/avx/kernel_sgesc_lib8.o ./kernel/avx/kernel_sgemv_8_lib8.o ./kernel/avx/kernel_sgemv_4_lib8.o
+# blas
+OBJS += ./blas/d_blas1_lib4.o ./blas/d_blas2_lib4.o ./blas/d_blas2_diag_lib.o ./blas/d_blas3_lib4.o ./blas/d_blas3_diag_lib4.o ./blas/d_lapack_lib4.o
+OBJS += ./blas/s_blas1_lib8.o ./blas/s_blas2_lib8.o ./blas/s_blas2_diag_lib.o ./blas/s_blas3_lib8.o ./blas/s_blas3_diag_lib8.o ./blas/s_lapack_lib8.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+# aux
+OBJS += ./auxiliary/d_aux_lib4.o ./auxiliary/c99/kernel_dgecp_lib4.o ./auxiliary/c99/kernel_dgetr_lib4.o 
+OBJS += ./auxiliary/s_aux_lib4.o ./auxiliary/c99/kernel_sgetr_lib4.o 
+OBJS += ./auxiliary/m_aux_lib44.o
+# kernels
+OBJS += ./kernel/sse3/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_diag_lib4.o ./kernel/c99/kernel_dgemv_4_lib4.o ./kernel/c99/kernel_dsymv_4_lib4.o ./kernel/c99/kernel_dgetrf_pivot_4_lib4.o ./kernel/c99/kernel_dgeqrf_4_lib4.o
+OBJS += ./kernel/c99/kernel_sgemm_4x4_lib4.o ./kernel/c99/kernel_sgemm_diag_lib4.o ./kernel/c99/kernel_sgemv_4_lib4.o ./kernel/c99/kernel_ssymv_4_lib4.o ./kernel/c99/kernel_sgetrf_pivot_4_lib4.o ./kernel/c99/kernel_sgecp_lib4.o
+# blas
+OBJS += ./blas/d_blas1_lib4.o ./blas/d_blas2_lib4.o ./blas/d_blas2_diag_lib.o ./blas/d_blas3_lib4.o ./blas/d_blas3_diag_lib4.o ./blas/d_lapack_lib4.o
+OBJS += ./blas/s_blas1_lib4.o ./blas/s_blas2_lib4.o ./blas/s_blas2_diag_lib.o ./blas/s_blas3_lib4.o ./blas/s_blas3_diag_lib4.o ./blas/s_lapack_lib4.o
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+# aux
+OBJS += ./auxiliary/d_aux_lib4.o ./auxiliary/c99/kernel_dgecp_lib4.o ./auxiliary/c99/kernel_dgetr_lib4.o 
+OBJS += ./auxiliary/s_aux_lib4.o ./auxiliary/c99/kernel_sgetr_lib4.o 
+OBJS += ./auxiliary/m_aux_lib44.o
+# kernels
+OBJS += ./kernel/fma/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_diag_lib4.o ./kernel/c99/kernel_dgemv_4_lib4.o ./kernel/c99/kernel_dsymv_4_lib4.o ./kernel/c99/kernel_dgetrf_pivot_4_lib4.o ./kernel/c99/kernel_dgeqrf_4_lib4.o
+OBJS += ./kernel/c99/kernel_sgemm_4x4_lib4.o ./kernel/c99/kernel_sgemm_diag_lib4.o ./kernel/c99/kernel_sgemv_4_lib4.o ./kernel/c99/kernel_ssymv_4_lib4.o ./kernel/c99/kernel_sgetrf_pivot_4_lib4.o ./kernel/c99/kernel_sgecp_lib4.o
+# blas
+OBJS += ./blas/d_blas1_lib4.o ./blas/d_blas2_lib4.o ./blas/d_blas2_diag_lib.o ./blas/d_blas3_lib4.o ./blas/d_blas3_diag_lib4.o ./blas/d_lapack_lib4.o
+OBJS += ./blas/s_blas1_lib4.o ./blas/s_blas2_lib4.o ./blas/s_blas2_diag_lib.o ./blas/s_blas3_lib4.o ./blas/s_blas3_diag_lib4.o ./blas/s_lapack_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+# aux
+OBJS += ./auxiliary/d_aux_lib4.o ./auxiliary/c99/kernel_dgecp_lib4.o ./auxiliary/c99/kernel_dgetr_lib4.o 
+OBJS += ./auxiliary/s_aux_lib4.o ./auxiliary/c99/kernel_sgetr_lib4.o 
+OBJS += ./auxiliary/m_aux_lib44.o
+# kernels
+OBJS += ./kernel/armv8a/kernel_dgemm_8x4_lib4.o ./kernel/armv8a/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_diag_lib4.o ./kernel/c99/kernel_dgemv_4_lib4.o ./kernel/c99/kernel_dsymv_4_lib4.o ./kernel/c99/kernel_dgetrf_pivot_4_lib4.o ./kernel/c99/kernel_dgeqrf_4_lib4.o
+OBJS += ./kernel/armv8a/kernel_sgemm_16x4_lib4.o ./kernel/armv8a/kernel_sgemm_12x4_lib4.o ./kernel/armv8a/kernel_sgemm_8x8_lib4.o ./kernel/armv8a/kernel_sgemm_8x4_lib4.o ./kernel/armv8a/kernel_sgemm_4x4_lib4.o ./kernel/c99/kernel_sgemm_4x4_lib4.o ./kernel/c99/kernel_sgemm_diag_lib4.o ./kernel/c99/kernel_sgemv_4_lib4.o ./kernel/c99/kernel_ssymv_4_lib4.o ./kernel/c99/kernel_sgetrf_pivot_4_lib4.o ./kernel/c99/kernel_sgecp_lib4.o
+# blas
+OBJS += ./blas/d_blas1_lib4.o ./blas/d_blas2_lib4.o ./blas/d_blas2_diag_lib.o ./blas/d_blas3_lib4.o ./blas/d_blas3_diag_lib4.o ./blas/d_lapack_lib4.o
+OBJS += ./blas/s_blas1_lib4.o ./blas/s_blas2_lib4.o ./blas/s_blas2_diag_lib.o ./blas/s_blas3_lib4.o ./blas/s_blas3_diag_lib4.o ./blas/s_lapack_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+# aux
+OBJS += ./auxiliary/d_aux_lib4.o ./auxiliary/c99/kernel_dgecp_lib4.o ./auxiliary/c99/kernel_dgetr_lib4.o 
+OBJS += ./auxiliary/s_aux_lib4.o ./auxiliary/c99/kernel_sgetr_lib4.o 
+OBJS += ./auxiliary/m_aux_lib44.o
+# kernels
+OBJS += ./kernel/armv7a/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_diag_lib4.o ./kernel/c99/kernel_dgemv_4_lib4.o ./kernel/c99/kernel_dsymv_4_lib4.o ./kernel/c99/kernel_dgetrf_pivot_4_lib4.o ./kernel/c99/kernel_dgeqrf_4_lib4.o
+OBJS += ./kernel/armv7a/kernel_sgemm_12x4_lib4.o ./kernel/armv7a/kernel_sgemm_8x4_lib4.o ./kernel/armv7a/kernel_sgemm_4x4_lib4.o ./kernel/c99/kernel_sgemm_4x4_lib4.o ./kernel/c99/kernel_sgemm_diag_lib4.o ./kernel/c99/kernel_sgemv_4_lib4.o ./kernel/c99/kernel_ssymv_4_lib4.o ./kernel/c99/kernel_sgetrf_pivot_4_lib4.o ./kernel/c99/kernel_sgecp_lib4.o
+# blas
+OBJS += ./blas/d_blas1_lib4.o ./blas/d_blas2_lib4.o ./blas/d_blas2_diag_lib.o ./blas/d_blas3_lib4.o ./blas/d_blas3_diag_lib4.o ./blas/d_lapack_lib4.o
+OBJS += ./blas/s_blas1_lib4.o ./blas/s_blas2_lib4.o ./blas/s_blas2_diag_lib.o ./blas/s_blas3_lib4.o ./blas/s_blas3_diag_lib4.o ./blas/s_lapack_lib4.o
+endif
+
+ifeq ($(TARGET), GENERIC)
+# aux
+OBJS += ./auxiliary/d_aux_lib4.o ./auxiliary/c99/kernel_dgecp_lib4.o ./auxiliary/c99/kernel_dgetr_lib4.o 
+OBJS += ./auxiliary/s_aux_lib4.o ./auxiliary/c99/kernel_sgetr_lib4.o 
+OBJS += ./auxiliary/m_aux_lib44.o
+# kernels
+OBJS += ./kernel/c99/kernel_dgemm_4x4_lib4.o ./kernel/c99/kernel_dgemm_diag_lib4.o ./kernel/c99/kernel_dgemv_4_lib4.o ./kernel/c99/kernel_dsymv_4_lib4.o ./kernel/c99/kernel_dgetrf_pivot_4_lib4.o ./kernel/c99/kernel_dgeqrf_4_lib4.o
+OBJS += ./kernel/c99/kernel_sgemm_4x4_lib4.o ./kernel/c99/kernel_sgemm_diag_lib4.o ./kernel/c99/kernel_sgemv_4_lib4.o ./kernel/c99/kernel_ssymv_4_lib4.o ./kernel/c99/kernel_sgetrf_pivot_4_lib4.o ./kernel/c99/kernel_sgecp_lib4.o
+# blas
+OBJS += ./blas/d_blas1_lib4.o ./blas/d_blas2_lib4.o ./blas/d_blas2_diag_lib.o ./blas/d_blas3_lib4.o ./blas/d_blas3_diag_lib4.o ./blas/d_lapack_lib4.o
+OBJS += ./blas/s_blas1_lib4.o ./blas/s_blas2_lib4.o ./blas/s_blas2_diag_lib.o ./blas/s_blas3_lib4.o ./blas/s_blas3_diag_lib4.o ./blas/s_lapack_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+# aux
+OBJS += ./auxiliary/d_aux_lib.o
+OBJS += ./auxiliary/s_aux_lib.o
+OBJS += ./auxiliary/m_aux_lib.o
+# blas
+OBJS += ./blas/d_blas1_lib.o ./blas/d_blas2_lib.o ./blas/d_blas2_diag_lib.o ./blas/d_blas3_lib.o ./blas/d_blas3_diag_lib.o ./blas/d_lapack_lib.o
+OBJS += ./blas/s_blas1_lib.o ./blas/s_blas2_lib.o ./blas/s_blas2_diag_lib.o ./blas/s_blas3_lib.o ./blas/s_blas3_diag_lib.o ./blas/s_lapack_lib.o
+
+endif # LA choice
+
+ifeq ($(EXT_DEP), 1)
+# ext dep
+OBJS += ./auxiliary/d_aux_ext_dep_lib.o
+OBJS += ./auxiliary/s_aux_ext_dep_lib.o
+OBJS += ./auxiliary/v_aux_ext_dep_lib.o
+OBJS += ./auxiliary/i_aux_ext_dep_lib.o
+endif
+
+
+
+all: clean static_library
+
+static_library: target
+	( cd auxiliary; $(MAKE) obj)
+	( cd kernel; $(MAKE) obj)
+	( cd blas; $(MAKE) obj)
+	ar rcs libblasfeo.a $(OBJS) 
+	cp libblasfeo.a ./lib/
+	@echo
+	@echo " libblasfeo.a static library build complete."
+	@echo
+
+shared_library: target
+	( cd auxiliary; $(MAKE) obj)
+	( cd kernel; $(MAKE) obj)
+	( cd blas; $(MAKE) obj)
+	gcc -shared -o libblasfeo.so $(OBJS)
+	cp libblasfeo.so ./lib/
+	@echo
+	@echo " libblasfeo.so shared library build complete."
+	@echo
+
+target:
+	touch ./include/blasfeo_target.h
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+	echo "#ifndef TARGET_X64_INTEL_HASWELL" > ./include/blasfeo_target.h
+	echo "#define TARGET_X64_INTEL_HASWELL" >> ./include/blasfeo_target.h
+	echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+	echo "#ifndef TARGET_X64_INTEL_SANDY_BRIDGE" > ./include/blasfeo_target.h
+	echo "#define TARGET_X64_INTEL_SANDY_BRIDGE" >> ./include/blasfeo_target.h
+	echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(TARGET), X64_INTEL_CORE)
+	echo "#ifndef TARGET_X64_INTEL_CORE" > ./include/blasfeo_target.h
+	echo "#define TARGET_X64_INTEL_CORE" >> ./include/blasfeo_target.h
+	echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+	echo "#ifndef TARGET_X64_AMD_BULLDOZER" > ./include/blasfeo_target.h
+	echo "#define TARGET_X64_AMD_BULLDOZER" >> ./include/blasfeo_target.h
+	echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+	echo "#ifndef TARGET_ARMV8A_ARM_CORTEX_A57" > ./include/blasfeo_target.h
+	echo "#define TARGET_ARMV8A_ARM_CORTEX_A57" >> ./include/blasfeo_target.h
+	echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(TARGET), GENERIC)
+	echo "#ifndef TARGET_GENERIC" > ./include/blasfeo_target.h
+	echo "#define TARGET_GENERIC" >> ./include/blasfeo_target.h
+	echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+	echo "#ifndef TARGET_ARMV7A_ARM_CORTEX_A15" > ./include/blasfeo_target.h
+	echo "#define TARGET_ARMV7A_ARM_CORTEX_A15" >> ./include/blasfeo_target.h
+	echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(LA), HIGH_PERFORMANCE)
+	echo "#ifndef LA_HIGH_PERFORMANCE" >> ./include/blasfeo_target.h
+	echo "#define LA_HIGH_PERFORMANCE" >> ./include/blasfeo_target.h
+	echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(LA), BLAS)
+	echo "#ifndef LA_BLAS" >> ./include/blasfeo_target.h
+	echo "#define LA_BLAS" >> ./include/blasfeo_target.h
+	echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(LA), REFERENCE)
+	echo "#ifndef LA_REFERENCE" >> ./include/blasfeo_target.h
+	echo "#define LA_REFERENCE" >> ./include/blasfeo_target.h
+	echo "#endif" >> ./include/blasfeo_target.h
+endif
+ifeq ($(EXT_DEP), 1)
+	echo "#ifndef EXT_DEP" >> ./include/blasfeo_target.h
+	echo "#define EXT_DEP" >> ./include/blasfeo_target.h
+	echo "#endif" >> ./include/blasfeo_target.h
+endif
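+
+# For example, with the Makefile.rule defaults (TARGET=X64_INTEL_HASWELL,
+# LA=HIGH_PERFORMANCE, EXT_DEP=1) the generated ./include/blasfeo_target.h
+# should contain:
+#
+#   #ifndef TARGET_X64_INTEL_HASWELL
+#   #define TARGET_X64_INTEL_HASWELL
+#   #endif
+#   #ifndef LA_HIGH_PERFORMANCE
+#   #define LA_HIGH_PERFORMANCE
+#   #endif
+#   #ifndef EXT_DEP
+#   #define EXT_DEP
+#   #endif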
+
+install_static:
+	mkdir -p $(PREFIX)/blasfeo
+	mkdir -p $(PREFIX)/blasfeo/lib
+	cp -f libblasfeo.a $(PREFIX)/blasfeo/lib/
+	mkdir -p $(PREFIX)/blasfeo/include
+	cp -f ./include/*.h $(PREFIX)/blasfeo/include/
+
+install_shared:
+	mkdir -p $(PREFIX)/blasfeo
+	mkdir -p $(PREFIX)/blasfeo/lib
+	cp -f libblasfeo.so $(PREFIX)/blasfeo/lib/
+	mkdir -p $(PREFIX)/blasfeo/include
+	cp -f ./include/*.h $(PREFIX)/blasfeo/include/
+
+test_problem:
+	cp libblasfeo.a ./test_problems/libblasfeo.a
+	$(MAKE) -C test_problems obj
+	@echo
+	@echo " Test problem build complete."
+	@echo
+
+run:
+	./test_problems/test.out
+
+clean:
+	rm -f libblasfeo.a
+	rm -f libblasfeo.so
+	rm -f ./lib/libblasfeo.a
+	rm -f ./lib/libblasfeo.so
+	$(MAKE) -C auxiliary clean
+	$(MAKE) -C kernel clean
+	$(MAKE) -C blas clean
+	$(MAKE) -C test_problems clean
+	$(MAKE) -C examples clean
+
diff --git a/Makefile.rule b/Makefile.rule
new file mode 100644
index 0000000..200721e
--- /dev/null
+++ b/Makefile.rule
@@ -0,0 +1,183 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+# Target architecture
+# X64_INTEL_HASWELL : x86_64 architecture with AVX2 and FMA ISA (64 bit OS) code optimized for Intel Haswell and Intel Skylake architectures.
+# X64_INTEL_SANDY_BRIDGE : x86_64 architecture with AVX ISA (64 bit OS) code optimized for Intel Sandy-Bridge architecture.
+# X64_INTEL_CORE : x86_64 architecture with SSE3 (64 bit OS) code optimized for Intel Core architecture.
+# X64_AMD_BULLDOZER : x86_64 architecture with AVX and FMA ISA (64 bit OS) code optimized for AMD Bulldozer.
+# ARMV8A_ARM_CORTEX_A57 : ARMv8A architecture with NEONv2 and VFPv4 ISA (64 bit OS) code optimized for ARM Cortex A57.
+# ARMV7A_ARM_CORTEX_A15 : ARMv7A architecture with NEON-VFPv4 ISA (32 bit OS) code optimized for ARM Cortex A15.
+# GENERIC : generic c99 code
+TARGET = X64_INTEL_HASWELL
+#TARGET = X64_INTEL_SANDY_BRIDGE
+#TARGET = X64_INTEL_CORE
+#TARGET = X64_AMD_BULLDOZER
+#TARGET = ARMV8A_ARM_CORTEX_A57
+#TARGET = ARMV7A_ARM_CORTEX_A15
+#TARGET = GENERIC
+
+# Linear Algebra library
+LA = HIGH_PERFORMANCE
+#LA = REFERENCE
+#LA = BLAS
+
+# BLAS and LAPACK version (for LA=BLAS)
+REF_BLAS = 0
+#REF_BLAS = OPENBLAS
+#REF_BLAS = NETLIB
+#REF_BLAS = MKL
+#REF_BLAS = BLIS
+#REF_BLAS = ATLAS
+
+# Compile auxiliary functions with external dependencies (for memory allocation and printing)
+#EXT_DEP = 0
+EXT_DEP = 1
+
+# Enable on-line checks for matrix and vector dimensions
+RUNTIME_CHECKS = 0
+#RUNTIME_CHECKS = 1
+
+# Operating system
+UNAME_S := $(shell uname -s)
+ifeq ($(UNAME_S),Linux)
+    OS = LINUX
+endif
+ifeq ($(UNAME_S),Darwin)
+    OS = MAC
+endif
+#OS = LINUX
+#OS = MAC
+#OS = WINDOWS
+
+# C Compiler
+CC = gcc
+#CC = clang
+#CC = x86_64-w64-mingw32-gcc
+
+# Installation directory
+PREFIX = /opt
+
+# Macro level (code size vs performance in assembly kernels): 0 (no macro), 1 (all macro but gemm kernel), 2 (all macro)
+MACRO_LEVEL = 0
+
+# compiler / assembler / linker flags
+CFLAGS  = 
+ASFLAGS = 
+LDFLAGS =
+
+# Optimization flags
+CFLAGS += -O2 -fPIC
+
+# Debugging flags
+#CFLAGS  += -g #-Wall -pedantic -Wfloat-equal #-pg
+#ASFLAGS += -g
+
+# Definitions
+ifeq ($(LA), HIGH_PERFORMANCE)
+CFLAGS  += -DLA_HIGH_PERFORMANCE
+endif
+ifeq ($(LA), REFERENCE)
+CFLAGS  += -DLA_REFERENCE
+endif
+ifeq ($(LA), BLAS)
+CFLAGS  += -DLA_BLAS
+endif
+
+ifeq ($(RUNTIME_CHECKS), 1)
+CFLAGS += -DDIM_CHECK
+endif
+
+ifeq ($(EXT_DEP), 1)
+CFLAGS += -DEXT_DEP
+endif
+
+ifeq ($(MACRO_LEVEL), 1)
+ASFLAGS += -DMACRO_LEVEL=1
+endif
+ifeq ($(MACRO_LEVEL), 2)
+ASFLAGS += -DMACRO_LEVEL=2
+endif
+
+ifeq ($(OS), LINUX)
+CFLAGS  += -DOS_LINUX
+ASFLAGS += -DOS_LINUX
+endif
+ifeq ($(OS), MAC)
+CFLAGS  += -DOS_MAC
+ASFLAGS += -DOS_MAC
+endif
+ifeq ($(OS), WINDOWS)
+CFLAGS  += -DOS_WINDOWS
+ASFLAGS += -DOS_WINDOWS
+endif
+
+ifeq ($(REF_BLAS), 0)
+CFLAGS  += 
+endif
+ifeq ($(REF_BLAS), OPENBLAS)
+CFLAGS  += -DREF_BLAS_OPENBLAS -I/opt/openblas/include
+endif
+ifeq ($(REF_BLAS), BLIS)
+CFLAGS  += -DREF_BLAS_BLIS -std=c99
+endif
+ifeq ($(REF_BLAS), NETLIB)
+CFLAGS  += -DREF_BLAS_NETLIB
+endif
+ifeq ($(REF_BLAS), MKL)
+CFLAGS  += -DREF_BLAS_MKL -m64 -I/opt/intel/mkl/include
+endif
+ifeq ($(REF_BLAS), ATLAS)
+CFLAGS  += -DREF_BLAS_ATLAS
+endif
+
+# Architecture-specific flags
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+CFLAGS  += -m64 -mavx2 -mfma -DTARGET_X64_INTEL_HASWELL
+endif
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+CFLAGS  += -m64 -mavx -DTARGET_X64_INTEL_SANDY_BRIDGE
+endif
+ifeq ($(TARGET), X64_INTEL_CORE)
+CFLAGS  += -m64 -msse3 -DTARGET_X64_INTEL_CORE
+endif
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+CFLAGS  += -m64 -mavx -mfma -DTARGET_X64_AMD_BULLDOZER
+endif
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+CFLAGS  += -march=armv8-a+crc+crypto+fp+simd -DTARGET_ARMV8A_ARM_CORTEX_A57
+ASFLAGS += -DTARGET_ARMV8A_ARM_CORTEX_A57
+endif
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+CFLAGS  += -marm -mfloat-abi=hard -mfpu=neon-vfpv4 -mcpu=cortex-a15 -DTARGET_ARMV7A_ARM_CORTEX_A15
+ASFLAGS += -mfpu=neon-vfpv4 -DTARGET_ARMV7A_ARM_CORTEX_A15
+endif
+ifeq ($(TARGET), GENERIC)
+CFLAGS  += -DTARGET_GENERIC
+endif
+
+
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..685a2c8
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,25 @@
+BLASFEO - BLAS For Embedded Optimization
+
+BLASFEO provides a set of linear algebra routines optimized for use in embedded optimization.
+It is for example employed in the Model Predictive Control software package HPMPC.
+
+BLASFEO provides three implementations of each linear algebra routine (LA):
+- HIGH_PERFORMANCE: a high-performance implementation hand-optimized for different computer architectures.
+- REFERENCE: a lightly-optimized version, coded entirely in C without assumptions about the computer architecture.
+- BLAS: a wrapper to BLAS and LAPACK routines.
+
+The currently supported computer architectures (TARGET) are:
+- X64_INTEL_HASWELL: Intel Haswell architecture or newer, AVX2 and FMA ISA, 64-bit OS.
+- X64_INTEL_SANDY_BRIDGE: Intel Sandy-Bridge architecture or newer, AVX ISA, 64-bit OS.
+- X64_INTEL_CORE: Intel Core architecture or newer, SSE3 ISA, 64-bit OS.
+- X64_AMD_BULLDOZER: AMD Bulldozer architecture, AVX and FMA ISAs, 64-bit OS.
+- ARMV8A_ARM_CORTEX_A57: ARMv8A architecture, VFPv4 and NEONv2 ISAs, 64-bit OS.
+- ARMV7A_ARM_CORTEX_A15: ARMv7A architecture, VFPv3 and NEON ISAs, 32-bit OS.
+- GENERIC: generic target, coded in C, giving better performance if the architecture provides more than 16 scalar FP registers (e.g. many RISC architectures such as ARM).
+
+The optimized linear algebra kernels are currently provided for OS_LINUX (x86_64 64-bit, ARMv8A 64-bit, ARMv7A 32-bit), OS_WINDOWS (x86_64 64-bit) and OS_MAC (x86_64 64-bit).
+
+BLASFEO employs structures to describe matrices (d_strmat) and vectors (d_strvec), defined in include/blasfeo_common.h.
+The actual implementation of d_strmat and d_strvec depends on the LA and TARGET choice; a minimal usage sketch is given at the end of this file.
+
+More information about BLASFEO can be found in the arXiv paper at https://arxiv.org/abs/1704.02457.
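+
+Minimal usage sketch, assuming the d_strmat API declared in this version's include/ headers
+(d_allocate_strmat, d_cvt_mat2strmat, dgemm_nt_libstr, d_print_strmat, d_free_strmat -- check
+include/blasfeo_d_aux_ext_dep.h and include/blasfeo_d_blas.h for the exact declarations) and a
+build with EXT_DEP=1 for the allocation and printing helpers:
+
+    #include <stdlib.h>
+
+    #include "blasfeo_target.h"
+    #include "blasfeo_common.h"
+    #include "blasfeo_d_aux.h"
+    #include "blasfeo_d_aux_ext_dep.h"   // allocation/printing helpers (EXT_DEP=1)
+    #include "blasfeo_d_blas.h"
+
+    int main()
+        {
+        int ii, n = 4;
+
+        // column-major input matrix
+        double *A = malloc(n*n*sizeof(double));
+        for(ii=0; ii<n*n; ii++) A[ii] = ii;
+
+        // convert to the panel-major d_strmat storage
+        struct d_strmat sA, sC;
+        d_allocate_strmat(n, n, &sA);
+        d_allocate_strmat(n, n, &sC);
+        d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+
+        // sC <- 1.0 * sA * sA^T + 0.0 * sC
+        dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sC, 0, 0, &sC, 0, 0);
+
+        d_print_strmat(n, n, &sC, 0, 0);
+
+        d_free_strmat(&sA);
+        d_free_strmat(&sC);
+        free(A);
+
+        return 0;
+        }
+
+Compile and link against the installed static library, e.g. (with the default PREFIX=/opt)
+gcc main.c -I/opt/blasfeo/include -L/opt/blasfeo/lib -lblasfeo -lm.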
diff --git a/TODOlist.txt b/TODOlist.txt
new file mode 100644
index 0000000..bba5ee0
--- /dev/null
+++ b/TODOlist.txt
@@ -0,0 +1,7 @@
+- syrk_potrf_ln_mn
+- alpha for trsm
+- kernels and _mn_ version of trmv
+- kernel dsymv dgemv_nt 4 avx
+- remove n from trmv
+- store_gen in single precision
+- clean target.h and create it also from cmake (see "file")
diff --git a/auxiliary/Makefile b/auxiliary/Makefile
new file mode 100644
index 0000000..d1242bd
--- /dev/null
+++ b/auxiliary/Makefile
@@ -0,0 +1,124 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../Makefile.rule
+
+OBJS = 
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib8.o
+OBJS += m_aux_lib48.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib8.o
+OBJS += m_aux_lib48.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+ifeq ($(TARGET), GENERIC)
+OBJS += d_aux_lib4.o
+OBJS += s_aux_lib4.o
+OBJS += m_aux_lib44.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+OBJS += d_aux_lib.o
+OBJS += s_aux_lib.o
+OBJS += m_aux_lib.o
+
+endif # LA choice
+
+ifeq ($(EXT_DEP), 1)
+#ext dep
+OBJS += d_aux_ext_dep_lib.o
+OBJS += s_aux_ext_dep_lib.o
+OBJS += v_aux_ext_dep_lib.o
+OBJS += i_aux_ext_dep_lib.o 
+endif
+
+obj: $(OBJS)
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+	( cd avx2; $(MAKE) obj)
+	( cd avx; $(MAKE) obj)
+	( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+	( cd avx; $(MAKE) obj)
+	( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), X64_INTEL_CORE)
+	( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+	( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+	( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+	( cd c99; $(MAKE) obj)
+endif
+ifeq ($(TARGET), GENERIC)
+	( cd c99; $(MAKE) obj)
+endif
+
+
+clean:
+	rm -f *.o
+	$(MAKE) -C avx2 clean
+	$(MAKE) -C avx clean
+	$(MAKE) -C c99 clean
diff --git a/auxiliary/avx/Makefile b/auxiliary/avx/Makefile
new file mode 100644
index 0000000..84e0154
--- /dev/null
+++ b/auxiliary/avx/Makefile
@@ -0,0 +1,50 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS = 
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgecp_lib4.o 
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+	rm -f *.o
diff --git a/auxiliary/avx/kernel_dgecp_lib4.c b/auxiliary/avx/kernel_dgecp_lib4.c
new file mode 100644
index 0000000..4bc8c9a
--- /dev/null
+++ b/auxiliary/avx/kernel_dgecp_lib4.c
@@ -0,0 +1,3024 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+
+
+// both A and B are aligned to 256-bit boundaries
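+// B <- alpha * A for an 8 x kmax block stored in the lib4 panel-major format:
+// the 8 rows are split over two consecutive 4-row panels, and the second panel
+// of each matrix starts bs*sda (resp. bs*sdb) doubles after the first.
+// With tri==1, A and B are lower triangular: one extra full column is copied and
+// the trailing 7x7 lower triangle is handled element by element at the end.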
+void kernel_dgecp_8_0_lib4(int tri, int kmax, double alpha, double *A0, int sda,  double *B0, int sdb)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 8-wide + end 7x7 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+
+	__m256d
+		alpha_0,
+		a_0;
+	
+	__m128d
+		c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B0[0+bs*0], a_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B0[0+bs*1], a_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B0[0+bs*2], a_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B0[0+bs*3], a_0 );
+
+		A0 += 16;
+		B0 += 16;
+
+		a_0 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+bs*0], a_0 );
+
+		a_0 = _mm256_load_pd( &A1[0+bs*1] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+bs*1], a_0 );
+
+		a_0 = _mm256_load_pd( &A1[0+bs*2] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+bs*2], a_0 );
+
+		a_0 = _mm256_load_pd( &A1[0+bs*3] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+bs*3], a_0 );
+
+		A1 += 16;
+		B1 += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B0[0+bs*0], a_0 );
+
+		A0 += 4;
+		B0 += 4;
+
+		a_0 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+bs*0], a_0 );
+
+		A1 += 4;
+		B1 += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 7x7 triangle 
+
+		c_0 = _mm_load_sd( &A0[1+0*bs] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[1+0*bs], c_0 );
+		c_0 = _mm_load_pd( &A0[2+0*bs] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B0[2+0*bs], c_0 );
+		a_0 = _mm256_load_pd( &A1[0+0*bs] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+0*bs], a_0 );
+
+		c_0 = _mm_load_pd( &A0[2+1*bs] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B0[2+1*bs], c_0 );
+		a_0 = _mm256_load_pd( &A1[0+1*bs] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+1*bs], a_0 );
+
+		c_0 = _mm_load_sd( &A0[3+2*bs] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[3+2*bs], c_0 );
+		a_0 = _mm256_load_pd( &A1[0+2*bs] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+2*bs], a_0 );
+
+		a_0 = _mm256_load_pd( &A1[0+3*bs] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B1[0+3*bs], a_0 );
+
+		c_0 = _mm_load_sd( &A1[1+4*bs] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[1+4*bs], c_0 );
+		c_0 = _mm_load_pd( &A1[2+4*bs] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[2+4*bs], c_0 );
+
+		c_0 = _mm_load_pd( &A1[2+5*bs] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[2+5*bs], c_0 );
+
+		c_0 = _mm_load_sd( &A1[3+6*bs] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+6*bs], c_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
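+// Same operation as kernel_dgecp_8_0_lib4, but the source block starts one row
+// inside its panel: each output column is assembled from the corresponding columns
+// of three consecutive panels of A (A0, A1, A2), realigned with
+// _mm256_permute2f128_pd/_mm256_shuffle_pd and then scaled by alpha.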
+void kernel_dgecp_8_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 8-wide + end 7x7 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+	double *A2 = A1 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+
+	__m256d
+		alpha_0,
+		a_0, a_1, a_2,
+		b_0, b_1;
+	
+	__m128d
+		c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+
+		a_2 = _mm256_load_pd( &A2[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B1[0+bs*1], b_1 );
+		_mm256_store_pd( &B0[0+bs*1], b_0 );
+
+		a_2 = _mm256_load_pd( &A2[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B1[0+bs*2], b_1 );
+		_mm256_store_pd( &B0[0+bs*2], b_0 );
+
+		a_2 = _mm256_load_pd( &A2[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B1[0+bs*3], b_1 );
+		_mm256_store_pd( &B0[0+bs*3], b_0 );
+
+		A0 += 16;
+		A1 += 16;
+		A2 += 16;
+		B0 += 16;
+		B1 += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+
+		A0 += 4;
+		A1 += 4;
+		A2 += 4;
+		B0 += 4;
+		B1 += 4;
+
+		}
+
+	if(tri==1)
+		{
+		// 7x7 triangle
+
+		c_0 = _mm_load_pd( &A0[2+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B0[1+bs*0], c_0 );
+		c_0 = _mm_load_sd( &A1[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[3+bs*0], c_0 );
+		c_0 = _mm_load_sd( &A1[1+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[0+bs*0], c_0 );
+		c_0 = _mm_load_pd( &A1[2+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*0], c_0 );
+		c_0 = _mm_load_sd( &A2[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*0], c_0 );
+
+		c_0 = _mm_load_sd( &A0[3+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[2+bs*1], c_0 );
+		c_0 = _mm_load_sd( &A1[0+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[3+bs*1], c_0 );
+		c_0 = _mm_load_sd( &A1[1+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[0+bs*1], c_0 );
+		c_0 = _mm_load_pd( &A1[2+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*1], c_0 );
+		c_0 = _mm_load_sd( &A2[0+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*1], c_0 );
+
+		c_0 = _mm_load_sd( &A1[0+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[3+bs*2], c_0 );
+		c_0 = _mm_load_sd( &A1[1+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[0+bs*2], c_0 );
+		c_0 = _mm_load_pd( &A1[2+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*2], c_0 );
+		c_0 = _mm_load_sd( &A2[0+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*2], c_0 );
+
+		c_0 = _mm_load_sd( &A1[1+bs*3] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[0+bs*3], c_0 );
+		c_0 = _mm_load_pd( &A1[2+bs*3] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*3], c_0 );
+		c_0 = _mm_load_sd( &A2[0+bs*3] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*3], c_0 );
+
+		c_0 = _mm_load_pd( &A1[2+bs*4] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*4], c_0 );
+		c_0 = _mm_load_sd( &A2[0+bs*4] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*4], c_0 );
+
+		c_0 = _mm_load_sd( &A1[3+bs*5] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[2+bs*5], c_0 );
+		c_0 = _mm_load_sd( &A2[0+bs*5] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*5], c_0 );
+
+		c_0 = _mm_load_sd( &A2[0+bs*6] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*6], c_0 );
+
+		}
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_8_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 8-wide + end 7x7 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+	double *A2 = A1 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+
+	__m256d
+		alpha_0,
+		a_0, a_1, a_2,
+		b_0, b_1;
+	
+	__m128d
+		c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_2 = _mm256_load_pd( &A2[0+bs*1] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*1], b_0 );
+		_mm256_store_pd( &B1[0+bs*1], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_2 = _mm256_load_pd( &A2[0+bs*2] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*2], b_0 );
+		_mm256_store_pd( &B1[0+bs*2], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_2 = _mm256_load_pd( &A2[0+bs*3] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*3], b_0 );
+		_mm256_store_pd( &B1[0+bs*3], b_1 );
+
+		A0 += 16;
+		A1 += 16;
+		A2 += 16;
+		B0 += 16;
+		B1 += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+
+		A0 += 4;
+		A1 += 4;
+		A2 += 4;
+		B0 += 4;
+		B1 += 4;
+
+		}
+
+	if(tri==1)
+		{
+		// 7x7 triangle 
+
+		c_0 = _mm_load_sd( &A0[3+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[1+bs*0], c_0 );
+		c_0 = _mm_load_pd( &A1[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B0[2+bs*0], c_0 );
+		c_0 = _mm_load_pd( &A1[2+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[0+bs*0], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[2+bs*0], c_0 );
+
+		c_0 = _mm_load_pd( &A1[0+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B0[2+bs*1], c_0 );
+		c_0 = _mm_load_pd( &A1[2+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[0+bs*1], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[2+bs*1], c_0 );
+
+		c_0 = _mm_load_sd( &A1[1+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[3+bs*2], c_0 );
+		c_0 = _mm_load_pd( &A1[2+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[0+bs*2], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[2+bs*2], c_0 );
+
+		c_0 = _mm_load_pd( &A1[2+bs*3] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[0+bs*3], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*3] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[2+bs*3], c_0 );
+
+		c_0 = _mm_load_sd( &A1[3+bs*4] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[1+bs*4], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*4] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[2+bs*4], c_0 );
+
+		c_0 = _mm_load_pd( &A2[0+bs*5] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B1[2+bs*5], c_0 );
+
+		c_0 = _mm_load_sd( &A2[1+bs*6] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*6], c_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_8_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 8-wide + end 7x7 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+	double *A2 = A1 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+
+	__m256d
+		alpha_0,
+		a_0, a_1, a_2,
+		b_0, b_1;
+	
+	__m128d
+		c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_2 = _mm256_load_pd( &A2[0+bs*1] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*1], b_0 );
+		_mm256_store_pd( &B1[0+bs*1], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_2 = _mm256_load_pd( &A2[0+bs*2] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*2], b_0 );
+		_mm256_store_pd( &B1[0+bs*2], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_2 = _mm256_load_pd( &A2[0+bs*3] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*3], b_0 );
+		_mm256_store_pd( &B1[0+bs*3], b_1 );
+
+		A0 += 16;
+		A1 += 16;
+		A2 += 16;
+		B0 += 16;
+		B1 += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+
+		A0 += 4;
+		A1 += 4;
+		A2 += 4;
+		B0 += 4;
+		B1 += 4;
+
+		}
+
+	if(tri==1)
+		{
+		// 7x7 triangle 
+
+		c_0 = _mm_load_pd( &A1[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B0[1+bs*0], c_0 );
+		c_0 = _mm_load_sd( &A1[2+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[3+bs*0], c_0 );
+		c_0 = _mm_load_sd( &A1[3+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[0+bs*0], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*0], c_0 );
+		c_0 = _mm_load_sd( &A2[2+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*0], c_0 );
+
+		c_0 = _mm_load_sd( &A1[1+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[2+bs*1], c_0 );
+		c_0 = _mm_load_sd( &A1[2+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[3+bs*1], c_0 );
+		c_0 = _mm_load_sd( &A1[3+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[0+bs*1], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*1], c_0 );
+		c_0 = _mm_load_sd( &A2[2+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*1], c_0 );
+
+		c_0 = _mm_load_sd( &A1[2+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B0[3+bs*2], c_0 );
+		c_0 = _mm_load_sd( &A1[3+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[0+bs*2], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*2], c_0 );
+		c_0 = _mm_load_sd( &A2[2+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*2], c_0 );
+
+		c_0 = _mm_load_sd( &A1[3+bs*3] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[0+bs*3], c_0 );
+		c_0 = _mm_load_pd( &A2[0+bs*3] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*3], c_0 );
+		c_0 = _mm_load_sd( &A2[2+bs*3] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*3], c_0 );
+
+		c_0 = _mm_load_pd( &A2[0+bs*4] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B1[1+bs*4], c_0 );
+		c_0 = _mm_load_sd( &A2[2+bs*4] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*4], c_0 );
+
+		c_0 = _mm_load_sd( &A2[1+bs*5] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[2+bs*5], c_0 );
+		c_0 = _mm_load_sd( &A2[2+bs*5] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*5], c_0 );
+
+		c_0 = _mm_load_sd( &A2[2+bs*6] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B1[3+bs*6], c_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgecp_4_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	__m256d
+		alpha_0,
+		a_0;
+	
+	__m128d
+		c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B[0+bs*0], a_0 );
+
+		a_0 = _mm256_load_pd( &A[0+bs*1] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B[0+bs*1], a_0 );
+
+		a_0 = _mm256_load_pd( &A[0+bs*2] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B[0+bs*2], a_0 );
+
+		a_0 = _mm256_load_pd( &A[0+bs*3] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B[0+bs*3], a_0 );
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		_mm256_store_pd( &B[0+bs*0], a_0 );
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 3x3 triangle
+
+		c_0 = _mm_load_sd( &A[1+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[1+bs*0], c_0 );
+		c_0 = _mm_load_pd( &A[2+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B[2+bs*0], c_0 );
+
+		c_0 = _mm_load_pd( &A[2+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B[2+bs*1], c_0 );
+
+		c_0 = _mm_load_sd( &A[3+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[3+bs*2], c_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgecp_4_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m256d
+		alpha_0,
+		a_0, a_1,
+		b_0;
+	
+	__m128d
+		c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*1], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*2], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*3], b_0 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 3x3 triangle
+
+		c_0 = _mm_load_pd( &A0[2+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B[1+bs*0], c_0 );
+		c_0 = _mm_load_sd( &A1[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[3+bs*0], c_0 );
+
+		c_0 = _mm_load_sd( &A0[3+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[2+bs*1], c_0 );
+		c_0 = _mm_load_sd( &A1[0+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[3+bs*1], c_0 );
+
+		c_0 = _mm_load_sd( &A1[0+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[3+bs*2], c_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
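+// the two-element shift needs only a single _mm256_permute2f128_pd per column:
+// B[0:3] = alpha * { A0[2], A0[3], A1[0], A1[1] }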
+void kernel_dgecp_4_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m256d
+		alpha_0,
+		a_0, a_1,
+		b_0;
+	
+	__m128d
+		c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*1], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*2], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*3], b_0 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 3x3 triangle
+
+		c_0 = _mm_load_sd( &A0[3+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[1+bs*0], c_0 );
+		c_0 = _mm_load_pd( &A1[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B[2+bs*0], c_0 );
+
+		c_0 = _mm_load_pd( &A1[0+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_pd( &B[2+bs*1], c_0 );
+
+		c_0 = _mm_load_sd( &A1[1+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[3+bs*2], c_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
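+// each stored column is the source column shifted down by three rows:
+// B[0:3] = alpha * { A0[3], A1[0], A1[1], A1[2] }, built with _mm256_permute2f128_pd + _mm256_shuffle_pd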
+void kernel_dgecp_4_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m256d
+		alpha_0,
+		a_0, a_1,
+		b_0;
+	
+	__m128d
+		c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*1], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*2], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*3], b_0 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 3x3 triangle
+
+		c_0 = _mm_load_pd( &A1[0+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_storeu_pd( &B[1+bs*0], c_0 );
+		c_0 = _mm_load_sd( &A1[2+bs*0] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[3+bs*0], c_0 );
+
+		c_0 = _mm_load_sd( &A1[1+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[2+bs*1], c_0 );
+		c_0 = _mm_load_sd( &A1[2+bs*1] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[3+bs*1], c_0 );
+
+		c_0 = _mm_load_sd( &A1[2+bs*2] );
+		c_0 = _mm_mul_pd( _mm256_castpd256_pd128( alpha_0 ), c_0 );
+		_mm_store_sd( &B[3+bs*2], c_0 );
+		}
+
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_3_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	__m128d
+		alpha_0,
+		a_0, a_1;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_loadu_pd( &A[0+bs*0] );
+		a_1 = _mm_load_sd( &A[2+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+		_mm_store_sd( &B[2+bs*0], a_1 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*1] );
+		a_1 = _mm_load_sd( &A[2+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[0+bs*1], a_0 );
+		_mm_store_sd( &B[2+bs*1], a_1 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*2] );
+		a_1 = _mm_load_sd( &A[2+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[0+bs*2], a_0 );
+		_mm_store_sd( &B[2+bs*2], a_1 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*3] );
+		a_1 = _mm_load_sd( &A[2+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[0+bs*3], a_0 );
+		_mm_store_sd( &B[2+bs*3], a_1 );
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_loadu_pd( &A[0+bs*0] );
+		a_1 = _mm_load_sd( &A[2+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+		_mm_store_sd( &B[2+bs*0], a_1 );
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 2x2 triangle
+
+		a_0 = _mm_loadu_pd( &A[1+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[1+bs*0], a_0 );
+
+		a_0 = _mm_load_sd( &A[2+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[2+bs*1], a_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_3_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m128d
+		alpha_0,
+		a_0, a_1;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+		a_1 = _mm_load_sd( &A1[0+bs*0] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_store_sd( &B[2+bs*0], a_1 );
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*1], a_0 );
+		a_1 = _mm_load_sd( &A1[0+bs*1] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_store_sd( &B[2+bs*1], a_1 );
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*2], a_0 );
+		a_1 = _mm_load_sd( &A1[0+bs*2] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_store_sd( &B[2+bs*2], a_1 );
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*3], a_0 );
+		a_1 = _mm_load_sd( &A1[0+bs*3] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_store_sd( &B[2+bs*3], a_1 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+		a_1 = _mm_load_sd( &A1[0+bs*0] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_store_sd( &B[2+bs*0], a_1 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 2x2 triangle
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[1+bs*0], a_0 );
+		a_0 = _mm_load_sd( &A1[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[2+bs*0], a_0 );
+
+		a_0 = _mm_load_sd( &A1[0+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[2+bs*1], a_0 );
+
+		}
+
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_3_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m128d
+		alpha_0,
+		a_0, a_1;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*0], a_0 );
+		a_1 = _mm_loadu_pd( &A1[0+bs*0] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[1+bs*0], a_1 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*1], a_0 );
+		a_1 = _mm_loadu_pd( &A1[0+bs*1] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[1+bs*1], a_1 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*2], a_0 );
+		a_1 = _mm_loadu_pd( &A1[0+bs*2] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[1+bs*2], a_1 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*3], a_0 );
+		a_1 = _mm_loadu_pd( &A1[0+bs*3] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[1+bs*3], a_1 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*0], a_0 );
+		a_1 = _mm_loadu_pd( &A1[0+bs*0] );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		_mm_storeu_pd( &B[1+bs*0], a_1 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 2x2 triangle
+
+		a_0 = _mm_loadu_pd( &A1[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[1+bs*0], a_0 );
+
+		a_0 = _mm_load_sd( &A1[1+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[2+bs*1], a_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_2_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 2-wide + end 1x1 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	__m128d
+		alpha_0,
+		a_0;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_loadu_pd( &A[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*1], a_0 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*2], a_0 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*3], a_0 );
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_loadu_pd( &A[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 1x1 triangle
+
+		a_0 = _mm_load_sd( &A[1+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[1+bs*0], a_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_2_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 2-wide + end 1x1 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m128d
+		alpha_0,
+		a_0;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*1] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*1], a_0 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*2] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*2], a_0 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*3] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*3], a_0 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 1x1 triangle
+
+		a_0 = _mm_load_sd( &A1[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[1+bs*0], a_0 );
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_1_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 1-wide
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	__m128d
+		alpha_0,
+		a_0;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_load_sd( &A[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*0], a_0 );
+
+		a_0 = _mm_load_sd( &A[0+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*1], a_0 );
+
+		a_0 = _mm_load_sd( &A[0+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*2], a_0 );
+
+		a_0 = _mm_load_sd( &A[0+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*3], a_0 );
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_load_sd( &A[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		_mm_store_sd( &B[0+bs*0], a_0 );
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgead_8_0_lib4(int kmax, double alpha, double *A0, int sda,  double *B0, int sdb)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+
+	__m256d
+		a_0, c_0, alpha_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		c_0 = _mm256_load_pd( &B0[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B0[0+bs*0], a_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		c_0 = _mm256_load_pd( &B0[0+bs*1] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B0[0+bs*1], a_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		c_0 = _mm256_load_pd( &B0[0+bs*2] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B0[0+bs*2], a_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		c_0 = _mm256_load_pd( &B0[0+bs*3] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B0[0+bs*3], a_0 );
+
+		A0 += 16;
+		B0 += 16;
+
+		a_0 = _mm256_load_pd( &A1[0+bs*0] );
+		c_0 = _mm256_load_pd( &B1[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B1[0+bs*0], a_0 );
+
+		a_0 = _mm256_load_pd( &A1[0+bs*1] );
+		c_0 = _mm256_load_pd( &B1[0+bs*1] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B1[0+bs*1], a_0 );
+
+		a_0 = _mm256_load_pd( &A1[0+bs*2] );
+		c_0 = _mm256_load_pd( &B1[0+bs*2] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B1[0+bs*2], a_0 );
+
+		a_0 = _mm256_load_pd( &A1[0+bs*3] );
+		c_0 = _mm256_load_pd( &B1[0+bs*3] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B1[0+bs*3], a_0 );
+
+		A1 += 16;
+		B1 += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		c_0 = _mm256_load_pd( &B0[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B0[0+bs*0], a_0 );
+
+		A0 += 4;
+		B0 += 4;
+
+		a_0 = _mm256_load_pd( &A1[0+bs*0] );
+		c_0 = _mm256_load_pd( &B1[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( a_0, c_0 );
+		_mm256_store_pd( &B1[0+bs*0], a_0 );
+
+		A1 += 4;
+		B1 += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
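+// per column, the 8 stored rows are taken one row down across three source panels:
+// B0[0:3] += alpha*{ A0[1], A0[2], A0[3], A1[0] }, B1[0:3] += alpha*{ A1[1], A1[2], A1[3], A2[0] }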
+void kernel_dgead_8_1_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+	double *A2 = A1 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+
+	__m256d
+		a_0, a_1, a_2,
+		b_0, b_1,
+		alpha_0, c_0, c_1;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		c_1 = _mm256_load_pd( &B1[0+bs*0] );
+		c_0 = _mm256_load_pd( &B0[0+bs*0] );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+
+		a_2 = _mm256_load_pd( &A2[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		c_1 = _mm256_load_pd( &B1[0+bs*1] );
+		c_0 = _mm256_load_pd( &B0[0+bs*1] );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		_mm256_store_pd( &B1[0+bs*1], b_1 );
+		_mm256_store_pd( &B0[0+bs*1], b_0 );
+
+		a_2 = _mm256_load_pd( &A2[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		c_1 = _mm256_load_pd( &B1[0+bs*2] );
+		c_0 = _mm256_load_pd( &B0[0+bs*2] );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		_mm256_store_pd( &B1[0+bs*2], b_1 );
+		_mm256_store_pd( &B0[0+bs*2], b_0 );
+
+		a_2 = _mm256_load_pd( &A2[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		c_1 = _mm256_load_pd( &B1[0+bs*3] );
+		c_0 = _mm256_load_pd( &B0[0+bs*3] );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		_mm256_store_pd( &B1[0+bs*3], b_1 );
+		_mm256_store_pd( &B0[0+bs*3], b_0 );
+
+		A0 += 16;
+		A1 += 16;
+		A2 += 16;
+		B0 += 16;
+		B1 += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_2 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_shuffle_pd( a_1, a_2, 0x5 );
+		b_0 = _mm256_shuffle_pd( a_0, b_0, 0x5 );
+		c_1 = _mm256_load_pd( &B1[0+bs*0] );
+		c_0 = _mm256_load_pd( &B0[0+bs*0] );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+
+		A0 += 4;
+		A1 += 4;
+		A2 += 4;
+		B0 += 4;
+		B1 += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
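+// two-element shift per column:
+// B0[0:3] += alpha*{ A0[2], A0[3], A1[0], A1[1] }, B1[0:3] += alpha*{ A1[2], A1[3], A2[0], A2[1] }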
+void kernel_dgead_8_2_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+	double *A2 = A1 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+
+	__m256d
+		a_0, a_1, a_2,
+		b_0, b_1,
+		alpha_0, c_0, c_1;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		c_0 = _mm256_load_pd( &B0[0+bs*0] );
+		c_1 = _mm256_load_pd( &B1[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_2 = _mm256_load_pd( &A2[0+bs*1] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		c_0 = _mm256_load_pd( &B0[0+bs*1] );
+		c_1 = _mm256_load_pd( &B1[0+bs*1] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*1], b_0 );
+		_mm256_store_pd( &B1[0+bs*1], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_2 = _mm256_load_pd( &A2[0+bs*2] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		c_0 = _mm256_load_pd( &B0[0+bs*2] );
+		c_1 = _mm256_load_pd( &B1[0+bs*2] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*2], b_0 );
+		_mm256_store_pd( &B1[0+bs*2], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_2 = _mm256_load_pd( &A2[0+bs*3] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		c_0 = _mm256_load_pd( &B0[0+bs*3] );
+		c_1 = _mm256_load_pd( &B1[0+bs*3] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*3], b_0 );
+		_mm256_store_pd( &B1[0+bs*3], b_1 );
+
+		A0 += 16;
+		A1 += 16;
+		A2 += 16;
+		B0 += 16;
+		B1 += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		c_0 = _mm256_load_pd( &B0[0+bs*0] );
+		c_1 = _mm256_load_pd( &B1[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+
+		A0 += 4;
+		A1 += 4;
+		A2 += 4;
+		B0 += 4;
+		B1 += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
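+// three-element shift per column:
+// B0[0:3] += alpha*{ A0[3], A1[0], A1[1], A1[2] }, B1[0:3] += alpha*{ A1[3], A2[0], A2[1], A2[2] }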
+void kernel_dgead_8_3_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+	double *A2 = A1 + bs*sda;
+	double *B1 = B0 + bs*sdb;
+
+	__m256d
+		a_0, a_1, a_2,
+		b_0, b_1,
+		alpha_0, c_0, c_1;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		c_0 = _mm256_load_pd( &B0[0+bs*0] );
+		c_1 = _mm256_load_pd( &B1[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_2 = _mm256_load_pd( &A2[0+bs*1] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		c_0 = _mm256_load_pd( &B0[0+bs*1] );
+		c_1 = _mm256_load_pd( &B1[0+bs*1] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*1], b_0 );
+		_mm256_store_pd( &B1[0+bs*1], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_2 = _mm256_load_pd( &A2[0+bs*2] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		c_0 = _mm256_load_pd( &B0[0+bs*2] );
+		c_1 = _mm256_load_pd( &B1[0+bs*2] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*2], b_0 );
+		_mm256_store_pd( &B1[0+bs*2], b_1 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_2 = _mm256_load_pd( &A2[0+bs*3] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		c_0 = _mm256_load_pd( &B0[0+bs*3] );
+		c_1 = _mm256_load_pd( &B1[0+bs*3] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*3], b_0 );
+		_mm256_store_pd( &B1[0+bs*3], b_1 );
+
+		A0 += 16;
+		A1 += 16;
+		A2 += 16;
+		B0 += 16;
+		B1 += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_2 = _mm256_load_pd( &A2[0+bs*0] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_1 = _mm256_permute2f128_pd( a_1, a_2, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		b_1 = _mm256_shuffle_pd( b_1, a_2, 0x5 );
+		c_0 = _mm256_load_pd( &B0[0+bs*0] );
+		c_1 = _mm256_load_pd( &B1[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_1 = _mm256_mul_pd( alpha_0, b_1 );
+		b_0 = _mm256_add_pd ( c_0, b_0 );
+		b_1 = _mm256_add_pd ( c_1, b_1 );
+		_mm256_store_pd( &B0[0+bs*0], b_0 );
+		_mm256_store_pd( &B1[0+bs*0], b_1 );
+
+		A0 += 4;
+		A1 += 4;
+		A2 += 4;
+		B0 += 4;
+		B1 += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgead_4_0_lib4(int kmax, double alpha, double *A, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	__m256d
+		a_0, c_0, alpha_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A[0+bs*0] );
+		c_0 = _mm256_load_pd( &B[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( c_0, a_0 );
+		_mm256_store_pd( &B[0+bs*0], a_0 );
+
+		a_0 = _mm256_load_pd( &A[0+bs*1] );
+		c_0 = _mm256_load_pd( &B[0+bs*1] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( c_0, a_0 );
+		_mm256_store_pd( &B[0+bs*1], a_0 );
+
+		a_0 = _mm256_load_pd( &A[0+bs*2] );
+		c_0 = _mm256_load_pd( &B[0+bs*2] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( c_0, a_0 );
+		_mm256_store_pd( &B[0+bs*2], a_0 );
+
+		a_0 = _mm256_load_pd( &A[0+bs*3] );
+		c_0 = _mm256_load_pd( &B[0+bs*3] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( c_0, a_0 );
+		_mm256_store_pd( &B[0+bs*3], a_0 );
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A[0+bs*0] );
+		c_0 = _mm256_load_pd( &B[0+bs*0] );
+		a_0 = _mm256_mul_pd( alpha_0, a_0 );
+		a_0 = _mm256_add_pd( c_0, a_0 );
+		_mm256_store_pd( &B[0+bs*0], a_0 );
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
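+// same one-element shift as kernel_dgecp_4_1_lib4, with the scaled column added to B instead of overwriting it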
+void kernel_dgead_4_1_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m256d
+		a_0, a_1,
+		b_0,
+		alpha_0, c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*1] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*1], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*2] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*2], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*3] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*3], b_0 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_1 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgead_4_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m256d
+		a_0, a_1,
+		b_0,
+		alpha_0, c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		c_0 = _mm256_load_pd( &B[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		c_0 = _mm256_load_pd( &B[0+bs*1] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*1], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		c_0 = _mm256_load_pd( &B[0+bs*2] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*2], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		c_0 = _mm256_load_pd( &B[0+bs*3] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*3], b_0 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		b_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		c_0 = _mm256_load_pd( &B[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_4_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m256d
+		a_0, a_1,
+		b_0,
+		alpha_0, c_0;
+	
+	int k;
+
+	alpha_0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*1] );
+		a_1 = _mm256_load_pd( &A1[0+bs*1] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*1] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*1], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*2] );
+		a_1 = _mm256_load_pd( &A1[0+bs*2] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*2] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*2], b_0 );
+
+		a_0 = _mm256_load_pd( &A0[0+bs*3] );
+		a_1 = _mm256_load_pd( &A1[0+bs*3] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*3] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*3], b_0 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm256_load_pd( &A0[0+bs*0] );
+		a_1 = _mm256_load_pd( &A1[0+bs*0] );
+		a_0 = _mm256_permute2f128_pd( a_0, a_1, 0x21 );
+		b_0 = _mm256_shuffle_pd( a_0, a_1, 0x5 );
+		c_0 = _mm256_load_pd( &B[0+bs*0] );
+		b_0 = _mm256_mul_pd( alpha_0, b_0 );
+		b_0 = _mm256_add_pd( c_0, b_0 );
+		_mm256_store_pd( &B[0+bs*0], b_0 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_3_0_lib4(int kmax, double alpha, double *A, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	__m128d
+		a_0, a_1,
+		alpha_0, c_0, c_1;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_loadu_pd( &A[0+bs*0] );
+		a_1 = _mm_load_sd( &A[2+bs*0] );
+		c_0 = _mm_loadu_pd( &B[0+bs*0] );
+		c_1 = _mm_load_sd( &B[2+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+		_mm_store_sd( &B[2+bs*0], a_1 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*1] );
+		a_1 = _mm_load_sd( &A[2+bs*1] );
+		c_0 = _mm_loadu_pd( &B[0+bs*1] );
+		c_1 = _mm_load_sd( &B[2+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*1], a_0 );
+		_mm_store_sd( &B[2+bs*1], a_1 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*2] );
+		a_1 = _mm_load_sd( &A[2+bs*2] );
+		c_0 = _mm_loadu_pd( &B[0+bs*2] );
+		c_1 = _mm_load_sd( &B[2+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*2], a_0 );
+		_mm_store_sd( &B[2+bs*2], a_1 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*3] );
+		a_1 = _mm_load_sd( &A[2+bs*3] );
+		c_0 = _mm_loadu_pd( &B[0+bs*3] );
+		c_1 = _mm_load_sd( &B[2+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*3], a_0 );
+		_mm_store_sd( &B[2+bs*3], a_1 );
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_loadu_pd( &A[0+bs*0] );
+		a_1 = _mm_load_sd( &A[2+bs*0] );
+		c_0 = _mm_loadu_pd( &B[0+bs*0] );
+		c_1 = _mm_load_sd( &B[2+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+		_mm_store_sd( &B[2+bs*0], a_1 );
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgead_3_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m128d
+		a_0, a_1,
+		alpha_0, c_0, c_1;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*0] );
+		a_1 = _mm_load_sd( &A1[0+bs*0] );
+		c_0 = _mm_loadu_pd( &B[0+bs*0] );
+		c_1 = _mm_load_sd( &B[2+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+		_mm_store_sd( &B[2+bs*0], a_1 );
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*1] );
+		a_1 = _mm_load_sd( &A1[0+bs*1] );
+		c_0 = _mm_loadu_pd( &B[0+bs*1] );
+		c_1 = _mm_load_sd( &B[2+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*1], a_0 );
+		_mm_store_sd( &B[2+bs*1], a_1 );
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*2] );
+		a_1 = _mm_load_sd( &A1[0+bs*2] );
+		c_0 = _mm_loadu_pd( &B[0+bs*2] );
+		c_1 = _mm_load_sd( &B[2+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*2], a_0 );
+		_mm_store_sd( &B[2+bs*2], a_1 );
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*3] );
+		a_1 = _mm_load_sd( &A1[0+bs*3] );
+		c_0 = _mm_loadu_pd( &B[0+bs*3] );
+		c_1 = _mm_load_sd( &B[2+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*3], a_0 );
+		_mm_store_sd( &B[2+bs*3], a_1 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_loadu_pd( &A0[2+bs*0] );
+		a_1 = _mm_load_sd( &A1[0+bs*0] );
+		c_0 = _mm_loadu_pd( &B[0+bs*0] );
+		c_1 = _mm_load_sd( &B[2+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_1 = _mm_mul_sd( alpha_0, a_1 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		a_1 = _mm_add_sd( c_1, a_1 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+		_mm_store_sd( &B[2+bs*0], a_1 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_3_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m128d
+		a_0, a_1,
+		alpha_0, c_0, c_1;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_1 = _mm_loadu_pd( &A1[0+bs*0] );
+		c_0 = _mm_load_sd( &B[0+bs*0] );
+		c_1 = _mm_loadu_pd( &B[1+bs*0] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		a_1 = _mm_add_pd( c_1, a_1 );
+		_mm_store_sd( &B[0+bs*0], a_0 );
+		_mm_storeu_pd( &B[1+bs*0], a_1 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*1] );
+		a_1 = _mm_loadu_pd( &A1[0+bs*1] );
+		c_0 = _mm_load_sd( &B[0+bs*1] );
+		c_1 = _mm_loadu_pd( &B[1+bs*1] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		a_1 = _mm_add_pd( c_1, a_1 );
+		_mm_store_sd( &B[0+bs*1], a_0 );
+		_mm_storeu_pd( &B[1+bs*1], a_1 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*2] );
+		a_1 = _mm_loadu_pd( &A1[0+bs*2] );
+		c_0 = _mm_load_sd( &B[0+bs*2] );
+		c_1 = _mm_loadu_pd( &B[1+bs*2] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		a_1 = _mm_add_pd( c_1, a_1 );
+		_mm_store_sd( &B[0+bs*2], a_0 );
+		_mm_storeu_pd( &B[1+bs*2], a_1 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*3] );
+		a_1 = _mm_loadu_pd( &A1[0+bs*3] );
+		c_0 = _mm_load_sd( &B[0+bs*3] );
+		c_1 = _mm_loadu_pd( &B[1+bs*3] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		a_1 = _mm_add_pd( c_1, a_1 );
+		_mm_store_sd( &B[0+bs*3], a_0 );
+		_mm_storeu_pd( &B[1+bs*3], a_1 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_1 = _mm_loadu_pd( &A1[0+bs*0] );
+		c_0 = _mm_load_sd( &B[0+bs*0] );
+		c_1 = _mm_loadu_pd( &B[1+bs*0] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_1 = _mm_mul_pd( alpha_0, a_1 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		a_1 = _mm_add_pd( c_1, a_1 );
+		_mm_store_sd( &B[0+bs*0], a_0 );
+		_mm_storeu_pd( &B[1+bs*0], a_1 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_2_0_lib4(int kmax, double alpha, double *A, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	__m128d
+		a_0, c_0, alpha_0;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_loadu_pd( &A[0+bs*0] );
+		c_0 = _mm_loadu_pd( &B[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*1] );
+		c_0 = _mm_loadu_pd( &B[0+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*1], a_0 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*2] );
+		c_0 = _mm_loadu_pd( &B[0+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*2], a_0 );
+
+		a_0 = _mm_loadu_pd( &A[0+bs*3] );
+		c_0 = _mm_loadu_pd( &B[0+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*3], a_0 );
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_loadu_pd( &A[0+bs*0] );
+		c_0 = _mm_loadu_pd( &B[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_2_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	__m128d
+		a_0, c_0, alpha_0;
+	
+	int k;
+
+	alpha_0 = _mm_loaddup_pd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
+		c_0 = _mm_loadu_pd( &B[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*1] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*1] );
+		c_0 = _mm_loadu_pd( &B[0+bs*1] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*1], a_0 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*2] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*2] );
+		c_0 = _mm_loadu_pd( &B[0+bs*2] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*2], a_0 );
+
+		a_0 = _mm_load_sd( &A0[3+bs*3] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*3] );
+		c_0 = _mm_loadu_pd( &B[0+bs*3] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*3], a_0 );
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_load_sd( &A0[3+bs*0] );
+		a_0 = _mm_loadh_pd( a_0, &A1[0+bs*0] );
+		c_0 = _mm_loadu_pd( &B[0+bs*0] );
+		a_0 = _mm_mul_pd( alpha_0, a_0 );
+		a_0 = _mm_add_pd( c_0, a_0 );
+		_mm_storeu_pd( &B[0+bs*0], a_0 );
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_1_0_lib4(int kmax, double alpha, double *A, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	__m128d
+		a_0, c_0, alpha_0;
+	
+	int k;
+
+	alpha_0 = _mm_load_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_0 = _mm_load_sd( &A[0+bs*0] );
+		c_0 = _mm_load_sd( &B[0+bs*0] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		_mm_store_sd( &B[0+bs*0], a_0 );
+
+		a_0 = _mm_load_sd( &A[0+bs*1] );
+		c_0 = _mm_load_sd( &B[0+bs*1] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		_mm_store_sd( &B[0+bs*1], a_0 );
+
+		a_0 = _mm_load_sd( &A[0+bs*2] );
+		c_0 = _mm_load_sd( &B[0+bs*2] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		_mm_store_sd( &B[0+bs*2], a_0 );
+
+		a_0 = _mm_load_sd( &A[0+bs*3] );
+		c_0 = _mm_load_sd( &B[0+bs*3] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		_mm_store_sd( &B[0+bs*3], a_0 );
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		a_0 = _mm_load_sd( &A[0+bs*0] );
+		c_0 = _mm_load_sd( &B[0+bs*0] );
+		a_0 = _mm_mul_sd( alpha_0, a_0 );
+		a_0 = _mm_add_sd( c_0, a_0 );
+		_mm_store_sd( &B[0+bs*0], a_0 );
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+void kernel_dgeset_4_lib4(int kmax, double alpha, double *A)
+	{
+
+	int k;
+
+	__m256d 
+		a0;
+
+	a0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		_mm256_store_pd( &A[0], a0 );
+		_mm256_store_pd( &A[4], a0 );
+		_mm256_store_pd( &A[8], a0 );
+		_mm256_store_pd( &A[12], a0 );
+
+		A += 16;
+
+		}	
+	for(; k<kmax; k++)
+		{
+
+		_mm256_store_pd( &A[0], a0 );
+
+		A += 4;
+
+		}
+	
+	}
+
+
+// A is lower triangular
+void kernel_dtrset_4_lib4(int kmax, double alpha, double *A)
+	{
+
+	int k;
+
+	__m256d 
+		a0;
+
+	a0 = _mm256_broadcast_sd( &alpha );
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		_mm256_store_pd( &A[0], a0 );
+		_mm256_store_pd( &A[4], a0 );
+		_mm256_store_pd( &A[8], a0 );
+		_mm256_store_pd( &A[12], a0 );
+
+		A += 16;
+
+		}	
+	for(; k<kmax; k++)
+		{
+
+		_mm256_store_pd( &A[0], a0 );
+
+		A += 4;
+
+		}
+	
+	// final 4x4 triangle
+	_mm256_store_pd( &A[0], a0 );
+
+	_mm_store_sd( &A[5], _mm256_castpd256_pd128( a0 ) );
+	_mm_store_pd( &A[6], _mm256_castpd256_pd128( a0 ) );
+	
+	_mm_store_pd( &A[10], _mm256_castpd256_pd128( a0 ) );
+
+	_mm_store_sd( &A[15], _mm256_castpd256_pd128( a0 ) );
+
+	}
+
+
+
diff --git a/auxiliary/avx/kernel_dgetr_lib4.c b/auxiliary/avx/kernel_dgetr_lib4.c
new file mode 100644
index 0000000..29d095b
--- /dev/null
+++ b/auxiliary/avx/kernel_dgetr_lib4.c
@@ -0,0 +1,490 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+
+
+// transpose of general matrices, read along panels, write across panels
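+// each 4x4 block is transposed in registers: _mm256_unpacklo_pd/_mm256_unpackhi_pd interleave pairs of columns,
+// then _mm256_permute2f128_pd recombines the 128-bit halves into the transposed columns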
+void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	__m256d
+		alph,
+		v0, v1, v2, v3,
+		v4, v5, v6, v7;
+	
+	alph = _mm256_broadcast_sd( &alpha );
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+
+	for( ; k<kmax-7; k+=8)
+		{
+
+		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+		
+		A += bs*bs;
+
+		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+		v0 = _mm256_mul_pd( v0, alph );
+		_mm256_store_pd( &C[0+bs*0], v0 );
+		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+		v2 = _mm256_mul_pd( v2, alph );
+		_mm256_store_pd( &C[0+bs*2], v2 );
+		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+		v1 = _mm256_mul_pd( v1, alph );
+		_mm256_store_pd( &C[0+bs*1], v1 );
+		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+		v3 = _mm256_mul_pd( v3, alph );
+		_mm256_store_pd( &C[0+bs*3], v3 );
+
+		C += bs*sdc;
+
+		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+		
+		A += bs*bs;
+
+		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+		v0 = _mm256_mul_pd( v0, alph );
+		_mm256_store_pd( &C[0+bs*0], v0 );
+		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+		v2 = _mm256_mul_pd( v2, alph );
+		_mm256_store_pd( &C[0+bs*2], v2 );
+		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+		v1 = _mm256_mul_pd( v1, alph );
+		_mm256_store_pd( &C[0+bs*1], v1 );
+		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+		v3 = _mm256_mul_pd( v3, alph );
+		_mm256_store_pd( &C[0+bs*3], v3 );
+
+		C += bs*sdc;
+
+		}
+
+	for( ; k<kmax-3; k+=4)
+		{
+
+		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+		
+		A += bs*bs;
+
+		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+		v0 = _mm256_mul_pd( v0, alph );
+		_mm256_store_pd( &C[0+bs*0], v0 );
+		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+		v2 = _mm256_mul_pd( v2, alph );
+		_mm256_store_pd( &C[0+bs*2], v2 );
+		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+		v1 = _mm256_mul_pd( v1, alph );
+		_mm256_store_pd( &C[0+bs*1], v1 );
+		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+		v3 = _mm256_mul_pd( v3, alph );
+		_mm256_store_pd( &C[0+bs*3], v3 );
+
+		C += bs*sdc;
+
+		}
+
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+		C[0+bs*3] = alpha * A[3+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	if(tri==1)
+		{
+		// end 3x3 triangle
+		kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+		if(kna==1)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+			C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
+			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+			}
+		else if(kna==2)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			C[1+bs*3] = alpha * A[3+bs*1];
+			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+			}
+		else
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			C[1+bs*3] = alpha * A[3+bs*1];
+			C[2+bs*3] = alpha * A[3+bs*2];
+			}
+		}
+
+	}
+
+
+
+// transpose of general matrices, read along panels, write across panels
+void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+		C[1+bs*2] = alpha * A[2+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+		C[2+bs*2] = alpha * A[2+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+		C[3+bs*2] = alpha * A[2+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	if(tri==1)
+		{
+		// end 2x2 triangle
+		kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+		if(kna==1)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+			}
+		else
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			}
+		}
+
+	}
+
+
+
+// transpose of general matrices, read along panels, write across panels
+void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 2-wide + end 1x1 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+	
+	if(tri==1)
+		{
+		// end 1x1 triangle
+		C[0+bs*1] = alpha * A[1+bs*0];
+		}
+
+	}
+
+
+
+// transpose of general matrices, read along panels, write across panels
+void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 1-wide
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	}
+
+
+
+// transpose of general matrices, read across panels, write along panels
+void kernel_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
+	{
+	const int ps = 4;
+	__m256d
+		v0, v1, v2, v3, v4, v5, v6, v7;
+	int k;
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		v0 = _mm256_load_pd( &A[0+ps*0] ); // 00 10 20 30
+		v1 = _mm256_load_pd( &A[0+ps*1] ); // 01 11 21 31
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+		v2 = _mm256_load_pd( &A[0+ps*2] ); // 02 12 22 32
+		v3 = _mm256_load_pd( &A[0+ps*3] ); // 03 13 23 33
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+		_mm256_store_pd( &B[0+ps*0], v0 );
+		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+		_mm256_store_pd( &B[0+ps*2], v2 );
+		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+		_mm256_store_pd( &B[0+ps*1], v1 );
+		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+		_mm256_store_pd( &B[0+ps*3], v3 );
+
+		A += ps*sda;
+		B += ps*ps;
+		}
+	for( ; k<kmax; k++)
+		{
+		//
+		B[0+ps*0] = A[0+ps*0];
+		B[1+ps*0] = A[0+ps*1];
+		B[2+ps*0] = A[0+ps*2];
+		B[3+ps*0] = A[0+ps*3];
+
+		A += 1;
+		B += ps;
+		}
+	return;
+	}
+
diff --git a/auxiliary/avx2/Makefile b/auxiliary/avx2/Makefile
new file mode 100644
index 0000000..463ebf5
--- /dev/null
+++ b/auxiliary/avx2/Makefile
@@ -0,0 +1,46 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS = 
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgetr_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+	rm -f *.o
diff --git a/auxiliary/avx2/kernel_dgetr_lib4.c b/auxiliary/avx2/kernel_dgetr_lib4.c
new file mode 100644
index 0000000..14d00ef
--- /dev/null
+++ b/auxiliary/avx2/kernel_dgetr_lib4.c
@@ -0,0 +1,756 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+
+
+
+// TODO: the tri (triangular) case is not handled by this kernel yet
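+// transpose of general matrices, read along panels, write across panels:
+// A0 and A1 point to two consecutive 4-row panels of A (A1 = A0 + bs*sda),
+// and the kernel stores alpha times the transpose of this 8-row strip into
+// the first 8 columns of C.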
+void kernel_dgetr_8_lib4(int tri, int kmax, int kna, double alpha, double *A0, int sda, double *C, int sdc)
+	{
+
+	const int bs = 4;
+	
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	__m256d
+		alph, 
+		v0, v1, v2, v3, v4, v5, v6, v7,
+		v8, v9, va, vb, vc, vd, ve, vf;
+	
+	alph = _mm256_broadcast_sd( &alpha );
+	
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A0[0+bs*0];
+			C[0+bs*1] = alpha * A0[1+bs*0];
+			C[0+bs*2] = alpha * A0[2+bs*0];
+			C[0+bs*3] = alpha * A0[3+bs*0];
+
+			C[0+bs*4] = alpha * A1[0+bs*0];
+			C[0+bs*5] = alpha * A1[1+bs*0];
+			C[0+bs*6] = alpha * A1[2+bs*0];
+			C[0+bs*7] = alpha * A1[3+bs*0];
+
+			C  += 1;
+			A0 += bs;
+			A1 += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
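+	// main loop: 4x4 blocks are transposed with shuffles. Each __m256d gathers
+	// two rows of two panel columns (e.g. 00 10 | 02 12) via _mm_load_pd and
+	// _mm256_insertf128_pd, so a single unpacklo/unpackhi then yields one full
+	// row of A, i.e. one column of C, which is scaled by alpha and stored.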
+	for(; k<kmax-7; k+=8)
+		{
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33
+		
+		A0 += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*0], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*1], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*2], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*3], v7 );
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+		A1 += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*4], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*5], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*6], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*7], v7 );
+
+		C += sdc*bs;
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33
+		
+		A0 += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*0], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*1], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*2], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*3], v7 );
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+		A1 += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*4], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*5], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*6], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*7], v7 );
+
+		C += sdc*bs;
+
+		}
+
+	for(; k<kmax-3; k+=4)
+		{
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33
+		
+		A0 += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*0], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*1], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*2], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*3], v7 );
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33
+
+		A1 += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*4], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*5], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*6], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*7], v7 );
+
+		C += sdc*bs;
+
+		}
+
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A0[0+bs*0];
+		C[0+bs*1] = alpha * A0[1+bs*0];
+		C[0+bs*2] = alpha * A0[2+bs*0];
+		C[0+bs*3] = alpha * A0[3+bs*0];
+
+		C[0+bs*4] = alpha * A1[0+bs*0];
+		C[0+bs*5] = alpha * A1[1+bs*0];
+		C[0+bs*6] = alpha * A1[2+bs*0];
+		C[0+bs*7] = alpha * A1[3+bs*0];
+
+		C  += 1;
+		A0 += bs;
+		A1 += bs;
+		}
+
+	}
+
+
+
+// transpose of general matrices, read along panels, write across panels
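+// Two equivalent 4x4 transpose schemes are kept below, selected by "#if 1":
+// the active one builds each vector from two 128-bit loads and transposes
+// with unpacklo/unpackhi, while the disabled one uses full 256-bit loads and
+// _mm256_permute2f128_pd but, as its TODO notes, does not yet apply alpha.
+// Example call (illustrative sketch only, assuming kna==0 so that C already
+// sits at a panel boundary, and n is the number of columns of the strip):
+//   kernel_dgetr_4_lib4(0, n, 0, 1.0, A, C, sdc);
+// copies the transpose of a 4-by-n panel strip A into an n-by-4 block of C.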
+void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	__m256d
+		alph,
+		v0, v1, v2, v3,
+		v4, v5, v6, v7;
+	
+	alph = _mm256_broadcast_sd( &alpha );
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+
+	for( ; k<kmax-7; k+=8)
+		{
+
+#if 1
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 ); // 21 31 23 33
+		
+		A += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*0], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*1], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*2], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*3], v7 );
+
+		C += sdc*bs;
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 );
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 );
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 );
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 );
+		
+		A += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*0], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*1], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*2], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*3], v7 );
+
+		C += sdc*bs;
+
+#else // TODO alpha
+
+		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+		
+		A += bs*bs;
+
+		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+		_mm256_store_pd( &C[0+bs*0], v0 );
+		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+		_mm256_store_pd( &C[0+bs*2], v2 );
+		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+		_mm256_store_pd( &C[0+bs*1], v1 );
+		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+		_mm256_store_pd( &C[0+bs*3], v3 );
+
+		C += bs*sdc;
+
+		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+		
+		A += bs*bs;
+
+		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+		_mm256_store_pd( &C[0+bs*0], v0 );
+		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+		_mm256_store_pd( &C[0+bs*2], v2 );
+		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+		_mm256_store_pd( &C[0+bs*1], v1 );
+		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+		_mm256_store_pd( &C[0+bs*3], v3 );
+
+		C += bs*sdc;
+
+#endif
+
+		}
+
+	for( ; k<kmax-3; k+=4)
+		{
+
+#if 1
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 ); // 21 31 23 33
+		
+		A += 4*bs;
+
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		v4 = _mm256_mul_pd( v4, alph );
+		_mm256_store_pd( &C[0+bs*0], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		v5 = _mm256_mul_pd( v5, alph );
+		_mm256_store_pd( &C[0+bs*1], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		v6 = _mm256_mul_pd( v6, alph );
+		_mm256_store_pd( &C[0+bs*2], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		v7 = _mm256_mul_pd( v7, alph );
+		_mm256_store_pd( &C[0+bs*3], v7 );
+
+		C += sdc*bs;
+
+#else
+
+		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
+		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
+		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+		
+		A += bs*bs;
+
+		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+		_mm256_store_pd( &C[0+bs*0], v0 );
+		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+		_mm256_store_pd( &C[0+bs*2], v2 );
+		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+		_mm256_store_pd( &C[0+bs*1], v1 );
+		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+		_mm256_store_pd( &C[0+bs*3], v3 );
+
+		C += bs*sdc;
+
+#endif
+
+		}
+
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+		C[0+bs*3] = alpha * A[3+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	if(tri==1)
+		{
+		// end 3x3 triangle
+		kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+		if(kna==1)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+			C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
+			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+			}
+		else if(kna==2)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			C[1+bs*3] = alpha * A[3+bs*1];
+			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+			}
+		else
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			C[1+bs*3] = alpha * A[3+bs*1];
+			C[2+bs*3] = alpha * A[3+bs*2];
+			}
+		}
+
+	}
+
+
+
+// transpose of general matrices, read along panels, write across panels
+void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+		C[1+bs*2] = alpha * A[2+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+		C[2+bs*2] = alpha * A[2+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+		C[3+bs*2] = alpha * A[2+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	if(tri==1)
+		{
+		// end 2x2 triangle
+		kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+		if(kna==1)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+			}
+		else
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			}
+		}
+
+	}
+
+
+
+// transpose of general matrices, read along panels, write across panels
+void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 2-wide + end 1x1 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+	
+	if(tri==1)
+		{
+		// end 1x1 triangle
+		C[0+bs*1] = alpha * A[1+bs*0];
+		}
+
+	}
+
+
+
+// transpose of general matrices, read along panels, write across panels
+void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 1-wide
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	}
+
+
+
+// transpose of general matrices, read across panels, write along panels
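+// Unlike the kernels above, this one reads one 4x4 block per panel of A
+// (advancing by ps*sda each iteration) and writes the transposed blocks
+// contiguously along a single panel of B; no alpha scaling is applied.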
+void kernel_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
+	{
+	const int ps = 4;
+	__m256d
+		v0, v1, v2, v3, v4, v5, v6, v7;
+	int k;
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+ps*0] ) ), _mm_load_pd( &A[0+ps*2]) , 0x1 ); // 00 10 02 12
+		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+ps*1] ) ), _mm_load_pd( &A[0+ps*3]) , 0x1 ); // 01 11 03 13
+		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+ps*0] ) ), _mm_load_pd( &A[2+ps*2]) , 0x1 ); // 20 30 22 32
+		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+ps*1] ) ), _mm_load_pd( &A[2+ps*3]) , 0x1 ); // 21 31 23 33
+		
+		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
+		_mm256_store_pd( &B[0+ps*0], v4 );
+		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
+		_mm256_store_pd( &B[0+ps*1], v5 );
+		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
+		_mm256_store_pd( &B[0+ps*2], v6 );
+		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
+		_mm256_store_pd( &B[0+ps*3], v7 );
+
+		A += ps*sda;
+		B += ps*ps;
+		}
+	for( ; k<kmax; k++)
+		{
+		//
+		B[0+ps*0] = A[0+ps*0];
+		B[1+ps*0] = A[0+ps*1];
+		B[2+ps*0] = A[0+ps*2];
+		B[3+ps*0] = A[0+ps*3];
+
+		A += 1;
+		B += ps;
+		}
+	return;
+	}
+
diff --git a/auxiliary/c99/Makefile b/auxiliary/c99/Makefile
new file mode 100644
index 0000000..6e9ea7b
--- /dev/null
+++ b/auxiliary/c99/Makefile
@@ -0,0 +1,77 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS = 
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += 
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += 
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+ifeq ($(TARGET), GENERIC)
+OBJS += kernel_dgecp_lib4.o kernel_dgetr_lib4.o
+OBJS += kernel_sgetr_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+	rm -f *.o
diff --git a/auxiliary/c99/kernel_dgecp_lib4.c b/auxiliary/c99/kernel_dgecp_lib4.c
new file mode 100644
index 0000000..e883072
--- /dev/null
+++ b/auxiliary/c99/kernel_dgecp_lib4.c
@@ -0,0 +1,1261 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
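+// kernel_dgecp_M_R_lib4 copies an M-row strip, B = alpha*A, where A starts
+// R rows below a panel boundary (for R>0 the rows are gathered from the two
+// panels A0 and A1 = A0 + bs*sda); with tri==1 the trailing lower-triangular
+// corner is copied as well.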
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgecp_4_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		B[0+bs*0] = alpha*A[0+bs*0];
+		B[1+bs*0] = alpha*A[1+bs*0];
+		B[2+bs*0] = alpha*A[2+bs*0];
+		B[3+bs*0] = alpha*A[3+bs*0];
+
+		B[0+bs*1] = alpha*A[0+bs*1];
+		B[1+bs*1] = alpha*A[1+bs*1];
+		B[2+bs*1] = alpha*A[2+bs*1];
+		B[3+bs*1] = alpha*A[3+bs*1];
+
+		B[0+bs*2] = alpha*A[0+bs*2];
+		B[1+bs*2] = alpha*A[1+bs*2];
+		B[2+bs*2] = alpha*A[2+bs*2];
+		B[3+bs*2] = alpha*A[3+bs*2];
+
+		B[0+bs*3] = alpha*A[0+bs*3];
+		B[1+bs*3] = alpha*A[1+bs*3];
+		B[2+bs*3] = alpha*A[2+bs*3];
+		B[3+bs*3] = alpha*A[3+bs*3];
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A[0+bs*0];
+		B[1+bs*0] = alpha*A[1+bs*0];
+		B[2+bs*0] = alpha*A[2+bs*0];
+		B[3+bs*0] = alpha*A[3+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 3x3 triangle
+
+		B[1+bs*0] = alpha*A[1+bs*0];
+		B[2+bs*0] = alpha*A[2+bs*0];
+		B[3+bs*0] = alpha*A[3+bs*0];
+
+		B[2+bs*1] = alpha*A[2+bs*1];
+		B[3+bs*1] = alpha*A[3+bs*1];
+
+		B[3+bs*2] = alpha*A[3+bs*2];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgecp_4_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] = alpha*A0[1+bs*0];
+		B[1+bs*0] = alpha*A0[2+bs*0];
+		B[2+bs*0] = alpha*A0[3+bs*0];
+		B[3+bs*0] = alpha*A1[0+bs*0];
+
+		B[0+bs*1] = alpha*A0[1+bs*1];
+		B[1+bs*1] = alpha*A0[2+bs*1];
+		B[2+bs*1] = alpha*A0[3+bs*1];
+		B[3+bs*1] = alpha*A1[0+bs*1];
+
+		B[0+bs*2] = alpha*A0[1+bs*2];
+		B[1+bs*2] = alpha*A0[2+bs*2];
+		B[2+bs*2] = alpha*A0[3+bs*2];
+		B[3+bs*2] = alpha*A1[0+bs*2];
+
+		B[0+bs*3] = alpha*A0[1+bs*3];
+		B[1+bs*3] = alpha*A0[2+bs*3];
+		B[2+bs*3] = alpha*A0[3+bs*3];
+		B[3+bs*3] = alpha*A1[0+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A0[1+bs*0];
+		B[1+bs*0] = alpha*A0[2+bs*0];
+		B[2+bs*0] = alpha*A0[3+bs*0];
+		B[3+bs*0] = alpha*A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 3x3 triangle
+
+		B[1+bs*0] = alpha*A0[2+bs*0];
+		B[2+bs*0] = alpha*A0[3+bs*0];
+		B[3+bs*0] = alpha*A1[0+bs*0];
+
+		B[2+bs*1] = alpha*A0[3+bs*1];
+		B[3+bs*1] = alpha*A1[0+bs*1];
+
+		B[3+bs*2] = alpha*A1[0+bs*2];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_4_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] = alpha*A0[2+bs*0];
+		B[1+bs*0] = alpha*A0[3+bs*0];
+		B[2+bs*0] = alpha*A1[0+bs*0];
+		B[3+bs*0] = alpha*A1[1+bs*0];
+
+		B[0+bs*1] = alpha*A0[2+bs*1];
+		B[1+bs*1] = alpha*A0[3+bs*1];
+		B[2+bs*1] = alpha*A1[0+bs*1];
+		B[3+bs*1] = alpha*A1[1+bs*1];
+
+		B[0+bs*2] = alpha*A0[2+bs*2];
+		B[1+bs*2] = alpha*A0[3+bs*2];
+		B[2+bs*2] = alpha*A1[0+bs*2];
+		B[3+bs*2] = alpha*A1[1+bs*2];
+
+		B[0+bs*3] = alpha*A0[2+bs*3];
+		B[1+bs*3] = alpha*A0[3+bs*3];
+		B[2+bs*3] = alpha*A1[0+bs*3];
+		B[3+bs*3] = alpha*A1[1+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A0[2+bs*0];
+		B[1+bs*0] = alpha*A0[3+bs*0];
+		B[2+bs*0] = alpha*A1[0+bs*0];
+		B[3+bs*0] = alpha*A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 3x3 triangle
+
+		B[1+bs*0] = alpha*A0[3+bs*0];
+		B[2+bs*0] = alpha*A1[0+bs*0];
+		B[3+bs*0] = alpha*A1[1+bs*0];
+
+		B[2+bs*1] = alpha*A1[0+bs*1];
+		B[3+bs*1] = alpha*A1[1+bs*1];
+
+		B[3+bs*2] = alpha*A1[1+bs*2];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_4_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] = alpha*A0[3+bs*0];
+		B[1+bs*0] = alpha*A1[0+bs*0];
+		B[2+bs*0] = alpha*A1[1+bs*0];
+		B[3+bs*0] = alpha*A1[2+bs*0];
+
+		B[0+bs*1] = alpha*A0[3+bs*1];
+		B[1+bs*1] = alpha*A1[0+bs*1];
+		B[2+bs*1] = alpha*A1[1+bs*1];
+		B[3+bs*1] = alpha*A1[2+bs*1];
+
+		B[0+bs*2] = alpha*A0[3+bs*2];
+		B[1+bs*2] = alpha*A1[0+bs*2];
+		B[2+bs*2] = alpha*A1[1+bs*2];
+		B[3+bs*2] = alpha*A1[2+bs*2];
+
+		B[0+bs*3] = alpha*A0[3+bs*3];
+		B[1+bs*3] = alpha*A1[0+bs*3];
+		B[2+bs*3] = alpha*A1[1+bs*3];
+		B[3+bs*3] = alpha*A1[2+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A0[3+bs*0];
+		B[1+bs*0] = alpha*A1[0+bs*0];
+		B[2+bs*0] = alpha*A1[1+bs*0];
+		B[3+bs*0] = alpha*A1[2+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 3x3 triangle
+
+		B[1+bs*0] = alpha*A1[0+bs*0];
+		B[2+bs*0] = alpha*A1[1+bs*0];
+		B[3+bs*0] = alpha*A1[2+bs*0];
+
+		B[2+bs*1] = alpha*A1[1+bs*1];
+		B[3+bs*1] = alpha*A1[2+bs*1];
+
+		B[3+bs*2] = alpha*A1[2+bs*2];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_3_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		B[0+bs*0] = alpha*A[0+bs*0];
+		B[1+bs*0] = alpha*A[1+bs*0];
+		B[2+bs*0] = alpha*A[2+bs*0];
+
+		B[0+bs*1] = alpha*A[0+bs*1];
+		B[1+bs*1] = alpha*A[1+bs*1];
+		B[2+bs*1] = alpha*A[2+bs*1];
+
+		B[0+bs*2] = alpha*A[0+bs*2];
+		B[1+bs*2] = alpha*A[1+bs*2];
+		B[2+bs*2] = alpha*A[2+bs*2];
+
+		B[0+bs*3] = alpha*A[0+bs*3];
+		B[1+bs*3] = alpha*A[1+bs*3];
+		B[2+bs*3] = alpha*A[2+bs*3];
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A[0+bs*0];
+		B[1+bs*0] = alpha*A[1+bs*0];
+		B[2+bs*0] = alpha*A[2+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 2x2 triangle
+
+		B[1+bs*0] = alpha*A[1+bs*0];
+		B[2+bs*0] = alpha*A[2+bs*0];
+
+		B[2+bs*1] = alpha*A[2+bs*1];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgecp_3_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] = alpha*A0[2+bs*0];
+		B[1+bs*0] = alpha*A0[3+bs*0];
+		B[2+bs*0] = alpha*A1[0+bs*0];
+
+		B[0+bs*1] = alpha*A0[2+bs*1];
+		B[1+bs*1] = alpha*A0[3+bs*1];
+		B[2+bs*1] = alpha*A1[0+bs*1];
+
+		B[0+bs*2] = alpha*A0[2+bs*2];
+		B[1+bs*2] = alpha*A0[3+bs*2];
+		B[2+bs*2] = alpha*A1[0+bs*2];
+
+		B[0+bs*3] = alpha*A0[2+bs*3];
+		B[1+bs*3] = alpha*A0[3+bs*3];
+		B[2+bs*3] = alpha*A1[0+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A0[2+bs*0];
+		B[1+bs*0] = alpha*A0[3+bs*0];
+		B[2+bs*0] = alpha*A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 2x2 triangle
+
+		B[1+bs*0] = alpha*A0[3+bs*0];
+		B[2+bs*0] = alpha*A1[0+bs*0];
+
+		B[2+bs*1] = alpha*A1[0+bs*1];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_3_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] = alpha*A0[3+bs*0];
+		B[1+bs*0] = alpha*A1[0+bs*0];
+		B[2+bs*0] = alpha*A1[1+bs*0];
+
+		B[0+bs*1] = alpha*A0[3+bs*1];
+		B[1+bs*1] = alpha*A1[0+bs*1];
+		B[2+bs*1] = alpha*A1[1+bs*1];
+
+		B[0+bs*2] = alpha*A0[3+bs*2];
+		B[1+bs*2] = alpha*A1[0+bs*2];
+		B[2+bs*2] = alpha*A1[1+bs*2];
+
+		B[0+bs*3] = alpha*A0[3+bs*3];
+		B[1+bs*3] = alpha*A1[0+bs*3];
+		B[2+bs*3] = alpha*A1[1+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A0[3+bs*0];
+		B[1+bs*0] = alpha*A1[0+bs*0];
+		B[2+bs*0] = alpha*A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 2x2 triangle
+
+		B[1+bs*0] = alpha*A1[0+bs*0];
+		B[2+bs*0] = alpha*A1[1+bs*0];
+
+		B[2+bs*1] = alpha*A1[1+bs*1];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_2_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 2-wide + end 1x1 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		B[0+bs*0] = alpha*A[0+bs*0];
+		B[1+bs*0] = alpha*A[1+bs*0];
+
+		B[0+bs*1] = alpha*A[0+bs*1];
+		B[1+bs*1] = alpha*A[1+bs*1];
+
+		B[0+bs*2] = alpha*A[0+bs*2];
+		B[1+bs*2] = alpha*A[1+bs*2];
+
+		B[0+bs*3] = alpha*A[0+bs*3];
+		B[1+bs*3] = alpha*A[1+bs*3];
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A[0+bs*0];
+		B[1+bs*0] = alpha*A[1+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 1x1 triangle
+
+		B[1+bs*0] = alpha*A[1+bs*0];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_dgecp_2_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 2-wide + end 1x1 triangle
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] = alpha*A0[3+bs*0];
+		B[1+bs*0] = alpha*A1[0+bs*0];
+
+		B[0+bs*1] = alpha*A0[3+bs*1];
+		B[1+bs*1] = alpha*A1[0+bs*1];
+
+		B[0+bs*2] = alpha*A0[3+bs*2];
+		B[1+bs*2] = alpha*A1[0+bs*2];
+
+		B[0+bs*3] = alpha*A0[3+bs*3];
+		B[1+bs*3] = alpha*A1[0+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A0[3+bs*0];
+		B[1+bs*0] = alpha*A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	if(tri==1)
+		{
+		// 1x1 triangle
+
+		B[1+bs*0] = alpha*A1[0+bs*0];
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgecp_1_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
+	{
+
+	if(tri==1)
+		{
+		// A and C are lower triangular
+		// kmax+1 1-wide
+
+		kmax += 1;
+		}
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		B[0+bs*0] = alpha*A[0+bs*0];
+
+		B[0+bs*1] = alpha*A[0+bs*1];
+
+		B[0+bs*2] = alpha*A[0+bs*2];
+
+		B[0+bs*3] = alpha*A[0+bs*3];
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] = alpha*A[0+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+
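+// The dgead kernels below mirror the dgecp kernels above, but accumulate
+// instead of copy: B += alpha*A, with the same row-offset naming and panel
+// layout, and no triangular variant.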
+// both A and B are aligned to 256-bit boundaries
+void kernel_dgead_4_0_lib4(int kmax, double alpha, double *A, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+		B[2+bs*0] += alpha * A[2+bs*0];
+		B[3+bs*0] += alpha * A[3+bs*0];
+
+		B[0+bs*1] += alpha * A[0+bs*1];
+		B[1+bs*1] += alpha * A[1+bs*1];
+		B[2+bs*1] += alpha * A[2+bs*1];
+		B[3+bs*1] += alpha * A[3+bs*1];
+
+		B[0+bs*2] += alpha * A[0+bs*2];
+		B[1+bs*2] += alpha * A[1+bs*2];
+		B[2+bs*2] += alpha * A[2+bs*2];
+		B[3+bs*2] += alpha * A[3+bs*2];
+
+		B[0+bs*3] += alpha * A[0+bs*3];
+		B[1+bs*3] += alpha * A[1+bs*3];
+		B[2+bs*3] += alpha * A[2+bs*3];
+		B[3+bs*3] += alpha * A[3+bs*3];
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+		B[2+bs*0] += alpha * A[2+bs*0];
+		B[3+bs*0] += alpha * A[3+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_dgead_4_1_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] += alpha * A0[1+bs*0];
+		B[1+bs*0] += alpha * A0[2+bs*0];
+		B[2+bs*0] += alpha * A0[3+bs*0];
+		B[3+bs*0] += alpha * A1[0+bs*0];
+
+		B[0+bs*1] += alpha * A0[1+bs*1];
+		B[1+bs*1] += alpha * A0[2+bs*1];
+		B[2+bs*1] += alpha * A0[3+bs*1];
+		B[3+bs*1] += alpha * A1[0+bs*1];
+
+		B[0+bs*2] += alpha * A0[1+bs*2];
+		B[1+bs*2] += alpha * A0[2+bs*2];
+		B[2+bs*2] += alpha * A0[3+bs*2];
+		B[3+bs*2] += alpha * A1[0+bs*2];
+
+		B[0+bs*3] += alpha * A0[1+bs*3];
+		B[1+bs*3] += alpha * A0[2+bs*3];
+		B[2+bs*3] += alpha * A0[3+bs*3];
+		B[3+bs*3] += alpha * A1[0+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[1+bs*0];
+		B[1+bs*0] += alpha * A0[2+bs*0];
+		B[2+bs*0] += alpha * A0[3+bs*0];
+		B[3+bs*0] += alpha * A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgead_4_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] += alpha * A0[2+bs*0];
+		B[1+bs*0] += alpha * A0[3+bs*0];
+		B[2+bs*0] += alpha * A1[0+bs*0];
+		B[3+bs*0] += alpha * A1[1+bs*0];
+
+		B[0+bs*1] += alpha * A0[2+bs*1];
+		B[1+bs*1] += alpha * A0[3+bs*1];
+		B[2+bs*1] += alpha * A1[0+bs*1];
+		B[3+bs*1] += alpha * A1[1+bs*1];
+
+		B[0+bs*2] += alpha * A0[2+bs*2];
+		B[1+bs*2] += alpha * A0[3+bs*2];
+		B[2+bs*2] += alpha * A1[0+bs*2];
+		B[3+bs*2] += alpha * A1[1+bs*2];
+
+		B[0+bs*3] += alpha * A0[2+bs*3];
+		B[1+bs*3] += alpha * A0[3+bs*3];
+		B[2+bs*3] += alpha * A1[0+bs*3];
+		B[3+bs*3] += alpha * A1[1+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[2+bs*0];
+		B[1+bs*0] += alpha * A0[3+bs*0];
+		B[2+bs*0] += alpha * A1[0+bs*0];
+		B[3+bs*0] += alpha * A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_4_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+		B[2+bs*0] += alpha * A1[1+bs*0];
+		B[3+bs*0] += alpha * A1[2+bs*0];
+
+		B[0+bs*1] += alpha * A0[3+bs*1];
+		B[1+bs*1] += alpha * A1[0+bs*1];
+		B[2+bs*1] += alpha * A1[1+bs*1];
+		B[3+bs*1] += alpha * A1[2+bs*1];
+
+		B[0+bs*2] += alpha * A0[3+bs*2];
+		B[1+bs*2] += alpha * A1[0+bs*2];
+		B[2+bs*2] += alpha * A1[1+bs*2];
+		B[3+bs*2] += alpha * A1[2+bs*2];
+
+		B[0+bs*3] += alpha * A0[3+bs*3];
+		B[1+bs*3] += alpha * A1[0+bs*3];
+		B[2+bs*3] += alpha * A1[1+bs*3];
+		B[3+bs*3] += alpha * A1[2+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+		B[2+bs*0] += alpha * A1[1+bs*0];
+		B[3+bs*0] += alpha * A1[2+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_3_0_lib4(int kmax, double alpha, double *A, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+		B[2+bs*0] += alpha * A[2+bs*0];
+
+		B[0+bs*1] += alpha * A[0+bs*1];
+		B[1+bs*1] += alpha * A[1+bs*1];
+		B[2+bs*1] += alpha * A[2+bs*1];
+
+		B[0+bs*2] += alpha * A[0+bs*2];
+		B[1+bs*2] += alpha * A[1+bs*2];
+		B[2+bs*2] += alpha * A[2+bs*2];
+
+		B[0+bs*3] += alpha * A[0+bs*3];
+		B[1+bs*3] += alpha * A[1+bs*3];
+		B[2+bs*3] += alpha * A[2+bs*3];
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+		B[2+bs*0] += alpha * A[2+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_dgead_3_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] += alpha * A0[2+bs*0];
+		B[1+bs*0] += alpha * A0[3+bs*0];
+		B[2+bs*0] += alpha * A1[0+bs*0];
+
+		B[0+bs*1] += alpha * A0[2+bs*1];
+		B[1+bs*1] += alpha * A0[3+bs*1];
+		B[2+bs*1] += alpha * A1[0+bs*1];
+
+		B[0+bs*2] += alpha * A0[2+bs*2];
+		B[1+bs*2] += alpha * A0[3+bs*2];
+		B[2+bs*2] += alpha * A1[0+bs*2];
+
+		B[0+bs*3] += alpha * A0[2+bs*3];
+		B[1+bs*3] += alpha * A0[3+bs*3];
+		B[2+bs*3] += alpha * A1[0+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[2+bs*0];
+		B[1+bs*0] += alpha * A0[3+bs*0];
+		B[2+bs*0] += alpha * A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_3_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+		B[2+bs*0] += alpha * A1[1+bs*0];
+
+		B[0+bs*1] += alpha * A0[3+bs*1];
+		B[1+bs*1] += alpha * A1[0+bs*1];
+		B[2+bs*1] += alpha * A1[1+bs*1];
+
+		B[0+bs*2] += alpha * A0[3+bs*2];
+		B[1+bs*2] += alpha * A1[0+bs*2];
+		B[2+bs*2] += alpha * A1[1+bs*2];
+
+		B[0+bs*3] += alpha * A0[3+bs*3];
+		B[1+bs*3] += alpha * A1[0+bs*3];
+		B[2+bs*3] += alpha * A1[1+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+		B[2+bs*0] += alpha * A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_2_0_lib4(int kmax, double alpha, double *A, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+
+		B[0+bs*1] += alpha * A[0+bs*1];
+		B[1+bs*1] += alpha * A[1+bs*1];
+
+		B[0+bs*2] += alpha * A[0+bs*2];
+		B[1+bs*2] += alpha * A[1+bs*2];
+
+		B[0+bs*3] += alpha * A[0+bs*3];
+		B[1+bs*3] += alpha * A[1+bs*3];
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_dgead_2_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	double *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+
+		B[0+bs*1] += alpha * A0[3+bs*1];
+		B[1+bs*1] += alpha * A1[0+bs*1];
+
+		B[0+bs*2] += alpha * A0[3+bs*2];
+		B[1+bs*2] += alpha * A1[0+bs*2];
+
+		B[0+bs*3] += alpha * A0[3+bs*3];
+		B[1+bs*3] += alpha * A1[0+bs*3];
+
+		A0 += 16;
+		A1 += 16;
+		B  += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_dgead_1_0_lib4(int kmax, double alpha, double *A, double *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		B[0+bs*0] += alpha * A[0+bs*0];
+
+		B[0+bs*1] += alpha * A[0+bs*1];
+
+		B[0+bs*2] += alpha * A[0+bs*2];
+
+		B[0+bs*3] += alpha * A[0+bs*3];
+
+		A += 16;
+		B += 16;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A[0+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
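+
+
+
+// Illustrative usage sketch (not part of the upstream BLASFEO sources). As the comments
+// above indicate, kernel_dgead_<m>_<off>_lib4 adds alpha*A to B where the m rows of A
+// start at row offset <off> inside a 4-wide panel, so they straddle two consecutive
+// panels A0 and A0+4*sda. The BLASFEO_EXAMPLES guard is hypothetical and is never
+// defined by the build system.
+#ifdef BLASFEO_EXAMPLES
+static void example_kernel_dgead_2_3(void)
+	{
+	// two consecutive 4x3 panels of A (panel stride sda=3 columns) and one 4x3 panel of B
+	double A[2*4*3];
+	double B[4*3];
+	int i;
+	for(i=0; i<2*4*3; i++)
+		A[i] = (double) i;
+	for(i=0; i<4*3; i++)
+		B[i] = 0.0;
+	// rows 0-1 of B get the 2-row strip of A that starts 3 rows into the first panel:
+	// its first row comes from the first panel, its second row from the second panel
+	kernel_dgead_2_3_lib4(3, 1.0, A, 3, B);
+	}
+#endif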
+
+
+
+
diff --git a/auxiliary/c99/kernel_dgetr_lib4.c b/auxiliary/c99/kernel_dgetr_lib4.c
new file mode 100644
index 0000000..7d62277
--- /dev/null
+++ b/auxiliary/c99/kernel_dgetr_lib4.c
@@ -0,0 +1,414 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+		C[0+bs*3] = alpha * A[3+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+		C[1+bs*2] = alpha * A[2+bs*1];
+		C[1+bs*3] = alpha * A[3+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+		C[2+bs*2] = alpha * A[2+bs*2];
+		C[2+bs*3] = alpha * A[3+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+		C[3+bs*2] = alpha * A[2+bs*3];
+		C[3+bs*3] = alpha * A[3+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+		C[0+bs*3] = alpha * A[3+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	if(tri==1)
+		{
+		// end 3x3 triangle
+		kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+		if(kna==1)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+			C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
+			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+			}
+		else if(kna==2)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			C[1+bs*3] = alpha * A[3+bs*1];
+			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+			}
+		else
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			C[1+bs*3] = alpha * A[3+bs*1];
+			C[2+bs*3] = alpha * A[3+bs*2];
+			}
+		}
+
+	}
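+
+
+
+// Illustrative usage sketch (not part of the upstream BLASFEO sources): transposing a
+// 4x8 strip stored in a single 4-wide panel of A into an 8x4 result stored in two 4-wide
+// panels of C. The BLASFEO_EXAMPLES guard is hypothetical and is never defined by the
+// build system.
+#ifdef BLASFEO_EXAMPLES
+static void example_kernel_dgetr_4(void)
+	{
+	double A[4*8];   // 4 rows x 8 columns, read along one panel
+	double C[2*4*4]; // 8 rows x 4 columns, written across two panels (panel stride sdc=4)
+	int i;
+	for(i=0; i<4*8; i++)
+		A[i] = (double) i;
+	// tri=0 (general matrix), kmax=8 columns, kna=0 (C already panel-aligned), alpha=1.0
+	kernel_dgetr_4_lib4(0, 8, 0, 1.0, A, C, 4);
+	}
+#endif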
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+		C[1+bs*2] = alpha * A[2+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+		C[2+bs*2] = alpha * A[2+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+		C[3+bs*2] = alpha * A[2+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	if(tri==1)
+		{
+		// end 2x2 triangle
+		kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+		if(kna==1)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+			}
+		else
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			}
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 2-wide + end 1x1 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+	
+	if(tri==1)
+		{
+		// end 1x1 triangle
+		C[0+bs*1] = alpha * A[1+bs*0];
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 1-wide
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read across panels, write along panels
+void kernel_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
+	{
+	const int ps = 4;
+	int k;
+	for(k=0; k<kmax-3; k+=4)
+		{
+		//
+		B[0+ps*0] = A[0+ps*0];
+		B[0+ps*1] = A[1+ps*0];
+		B[0+ps*2] = A[2+ps*0];
+		B[0+ps*3] = A[3+ps*0];
+		//
+		B[1+ps*0] = A[0+ps*1];
+		B[1+ps*1] = A[1+ps*1];
+		B[1+ps*2] = A[2+ps*1];
+		B[1+ps*3] = A[3+ps*1];
+		//
+		B[2+ps*0] = A[0+ps*2];
+		B[2+ps*1] = A[1+ps*2];
+		B[2+ps*2] = A[2+ps*2];
+		B[2+ps*3] = A[3+ps*2];
+		//
+		B[3+ps*0] = A[0+ps*3];
+		B[3+ps*1] = A[1+ps*3];
+		B[3+ps*2] = A[2+ps*3];
+		B[3+ps*3] = A[3+ps*3];
+
+		A += ps*sda;
+		B += ps*ps;
+		}
+	for( ; k<kmax; k++)
+		{
+		//
+		B[0+ps*0] = A[0+ps*0];
+		B[1+ps*0] = A[0+ps*1];
+		B[2+ps*0] = A[0+ps*2];
+		B[3+ps*0] = A[0+ps*3];
+
+		A += 1;
+		B += ps;
+		}
+	return;
+	}
+
diff --git a/auxiliary/c99/kernel_sgetr_lib4.c b/auxiliary/c99/kernel_sgetr_lib4.c
new file mode 100644
index 0000000..4cf6fa2
--- /dev/null
+++ b/auxiliary/c99/kernel_sgetr_lib4.c
@@ -0,0 +1,370 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_sgetr_4_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 4-wide + end 3x3 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+		C[0+bs*3] = alpha * A[3+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+		C[1+bs*2] = alpha * A[2+bs*1];
+		C[1+bs*3] = alpha * A[3+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+		C[2+bs*2] = alpha * A[2+bs*2];
+		C[2+bs*3] = alpha * A[3+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+		C[3+bs*2] = alpha * A[2+bs*3];
+		C[3+bs*3] = alpha * A[3+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+		C[0+bs*3] = alpha * A[3+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	if(tri==1)
+		{
+		// end 3x3 triangle
+		kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+		if(kna==1)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+			C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
+			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+			}
+		else if(kna==2)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			C[1+bs*3] = alpha * A[3+bs*1];
+			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
+			}
+		else
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[0+bs*3] = alpha * A[3+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			C[1+bs*3] = alpha * A[3+bs*1];
+			C[2+bs*3] = alpha * A[3+bs*2];
+			}
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_sgetr_3_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 3-wide + end 2x2 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+		C[1+bs*2] = alpha * A[2+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+		C[2+bs*2] = alpha * A[2+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+		C[3+bs*2] = alpha * A[2+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+		C[0+bs*2] = alpha * A[2+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	if(tri==1)
+		{
+		// end 2x2 triangle
+		kna = (bs-(bs-kna+kmax)%bs)%bs;
+
+		if(kna==1)
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
+			}
+		else
+			{
+			C[0+bs*1] = alpha * A[1+bs*0];
+			C[0+bs*2] = alpha * A[2+bs*0];
+			C[1+bs*2] = alpha * A[2+bs*1];
+			}
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_sgetr_2_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 2-wide + end 1x1 triangle
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+			C[0+bs*1] = alpha * A[1+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+		C[1+bs*1] = alpha * A[1+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+		C[2+bs*1] = alpha * A[1+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+		C[3+bs*1] = alpha * A[1+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+		C[0+bs*1] = alpha * A[1+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+	
+	if(tri==1)
+		{
+		// end 1x1 triangle
+		C[0+bs*1] = alpha * A[1+bs*0];
+		}
+
+	}
+
+
+
+// transpose of a general matrix, read along panels, write across panels
+void kernel_sgetr_1_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
+	{
+
+	if(tri==1)
+		{
+		// A is lower triangular, C is upper triangular
+		// kmax+1 1-wide
+
+		kmax += 1;
+		}
+
+	const int bs = 4;
+	
+	int k;
+
+	k = 0;
+
+	if(kmax<kna)
+		goto cleanup_loop;
+
+	if(kna>0)
+		{
+		for( ; k<kna; k++)
+			{
+			C[0+bs*0] = alpha * A[0+bs*0];
+
+			C += 1;
+			A += bs;
+			}
+		C += bs*(sdc-1);
+		}
+	
+	for( ; k<kmax-3; k+=4)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+
+		C[1+bs*0] = alpha * A[0+bs*1];
+
+		C[2+bs*0] = alpha * A[0+bs*2];
+
+		C[3+bs*0] = alpha * A[0+bs*3];
+
+		C += bs*sdc;
+		A += bs*bs;
+		}
+	
+	cleanup_loop:
+
+	for( ; k<kmax; k++)
+		{
+		C[0+bs*0] = alpha * A[0+bs*0];
+
+		C += 1;
+		A += bs;
+		}
+
+	}
+
+
+
+
diff --git a/auxiliary/d_aux_ext_dep_lib.c b/auxiliary/d_aux_ext_dep_lib.c
new file mode 100644
index 0000000..c12da10
--- /dev/null
+++ b/auxiliary/d_aux_ext_dep_lib.c
@@ -0,0 +1,632 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#if 0
+#include <malloc.h>
+#endif
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if ! defined(OS_WINDOWS)
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+#endif
+
+
+
+/* creates a zero matrix */
+void d_zeros(double **pA, int row, int col)
+	{
+	*pA = malloc((row*col)*sizeof(double));
+	double *A = *pA;
+	int i;
+	for(i=0; i<row*col; i++) A[i] = 0.0;
+	}
+
+
+
+/* creates a zero matrix aligned to a cache line */
+void d_zeros_align(double **pA, int row, int col)
+	{
+#if defined(OS_WINDOWS)
+	*pA = (double *) _aligned_malloc( (row*col)*sizeof(double), 64 );
+#else
+	void *temp;
+	int err = posix_memalign(&temp, 64, (row*col)*sizeof(double));
+	if(err!=0)
+		{
+		printf("Memory allocation error\n");
+		exit(1);
+		}
+	*pA = temp;
+#endif
+	double *A = *pA;
+	int i;
+	for(i=0; i<row*col; i++) A[i] = 0.0;
+	}
+
+
+
+/* frees matrix */
+void d_free(double *pA)
+	{
+	free( pA );
+	}
+
+
+
+/* frees aligned matrix */
+void d_free_align(double *pA)
+	{
+#if defined(OS_WINDOWS)
+	_aligned_free( pA );
+#else
+	free( pA );
+#endif
+	}
+
+
+
+/* prints a matrix in column-major format */
+void d_print_mat(int m, int n, double *A, int lda)
+	{
+	int i, j;
+	for(i=0; i<m; i++)
+		{
+		for(j=0; j<n; j++)
+			{
+			printf("%9.5f ", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
+
+
+/* prints the transpose of a matrix in column-major format */
+void d_print_tran_mat(int row, int col, double *A, int lda)
+	{
+	int i, j;
+	for(j=0; j<col; j++)
+		{
+		for(i=0; i<row; i++)
+			{
+			printf("%9.5f ", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
+
+
+/* prints a matrix in column-major format to a file */
+void d_print_to_file_mat(FILE *file, int row, int col, double *A, int lda)
+	{
+	int i, j;
+	for(i=0; i<row; i++)
+		{
+		for(j=0; j<col; j++)
+			{
+			fprintf(file, "%9.5f ", A[i+lda*j]);
+			}
+		fprintf(file, "\n");
+		}
+	fprintf(file, "\n");
+	}	
+
+
+
+/* prints the transpose of a matrix in column-major format to a file */
+void d_print_tran_to_file_mat(FILE *file, int row, int col, double *A, int lda)
+	{
+	int i, j;
+	for(j=0; j<col; j++)
+		{
+		for(i=0; i<row; i++)
+			{
+			fprintf(file, "%9.5f ", A[i+lda*j]);
+			}
+		fprintf(file, "\n");
+		}
+	fprintf(file, "\n");
+	}	
+
+
+
+/* prints a matrix in column-major format (exponential notation) */
+void d_print_e_mat(int m, int n, double *A, int lda)
+	{
+	int i, j;
+	for(i=0; i<m; i++)
+		{
+		for(j=0; j<n; j++)
+			{
+			printf("%1.15e\t", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
+
+
+/* prints the transpose of a matrix in column-major format (exponential notation) */
+void d_print_e_tran_mat(int row, int col, double *A, int lda)
+	{
+	int i, j;
+	for(j=0; j<col; j++)
+		{
+		for(i=0; i<row; i++)
+			{
+			printf("%e\t", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
+
+
+/****************************
+* new interface
+****************************/
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+#include "../include/blasfeo_block_size.h"
+
+
+
+// create a matrix structure for a matrix of size m*n by dynamically allocating the memory
+void d_allocate_strmat(int m, int n, struct d_strmat *sA)
+	{
+	const int bs = D_PS;
+	int nc = D_NC;
+	int al = bs*nc;
+	sA->m = m;
+	sA->n = n;
+	int pm = (m+bs-1)/bs*bs;
+	int cn = (n+nc-1)/nc*nc;
+	sA->pm = pm;
+	sA->cn = cn;
+	d_zeros_align(&(sA->pA), sA->pm, sA->cn);
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	d_zeros_align(&(sA->dA), tmp, 1);
+	sA->use_dA = 0;
+	sA->memory_size = (pm*cn+tmp)*sizeof(double);
+	return;
+	}
+
+
+
+// free memory of a matrix structure
+void d_free_strmat(struct d_strmat *sA)
+	{
+	d_free_align(sA->pA);
+	d_free_align(sA->dA);
+	return;
+	}
+
+
+
+// create a vector structure for a vector of size m by dynamically allocating the memory
+void d_allocate_strvec(int m, struct d_strvec *sa)
+	{
+	const int bs = D_PS;
+//	int nc = D_NC;
+//	int al = bs*nc;
+	sa->m = m;
+	int pm = (m+bs-1)/bs*bs;
+	sa->pm = pm;
+	d_zeros_align(&(sa->pa), sa->pm, 1);
+	sa->memory_size = pm*sizeof(double);
+	return;
+	}
+
+
+
+// free memory of a vector structure
+void d_free_strvec(struct d_strvec *sa)
+	{
+	d_free_align(sa->pa);
+	return;
+	}
+
+
+
+// print a matrix structure
+void d_print_strmat(int m, int n, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = D_PS;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int ii, i, j, tmp;
+	ii = 0;
+	if(ai%bs>0)
+		{
+		tmp = bs-ai%bs;
+		tmp = m<tmp ? m : tmp;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%9.5f ", pA[i+bs*j]);
+				}
+			printf("\n");
+			}
+		pA += tmp + bs*(sda-1);
+		m -= tmp;
+		}
+	for( ; ii<m-(bs-1); ii+=bs)
+		{
+		for(i=0; i<bs; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%9.5f ", pA[i+bs*j+sda*ii]);
+				}
+			printf("\n");
+			}
+		}
+	if(ii<m)
+		{
+		tmp = m-ii;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%9.5f ", pA[i+bs*j+sda*ii]);
+				}
+			printf("\n");
+			}
+		}
+	printf("\n");
+	return;
+	}
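+
+
+
+// Illustrative usage sketch (not part of the upstream BLASFEO sources): allocate a
+// strmat, print it and free it again. d_allocate_strmat() rounds the sizes up to full
+// panels internally and zeros the memory, so the print shows a 5x3 block of zeros.
+// The BLASFEO_EXAMPLES guard is hypothetical and is never defined by the build system.
+#ifdef BLASFEO_EXAMPLES
+static void example_allocate_and_print_strmat(void)
+	{
+	struct d_strmat sA;
+	d_allocate_strmat(5, 3, &sA);
+	d_print_strmat(5, 3, &sA, 0, 0);
+	d_free_strmat(&sA);
+	}
+#endif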
+
+
+
+// print a vector structure
+void d_print_strvec(int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_mat(m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void d_print_tran_strvec(int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_mat(1, m, pa, 1);
+	return;
+	}
+
+
+
+// print a matrix structure
+void d_print_to_file_strmat(FILE * file, int m, int n, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = D_PS;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int ii, i, j, tmp;
+	ii = 0;
+	if(ai%bs>0)
+		{
+		tmp = bs-ai%bs;
+		tmp = m<tmp ? m : tmp;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				fprintf(file, "%9.5f ", pA[i+bs*j]);
+				}
+			fprintf(file, "\n");
+			}
+		pA += tmp + bs*(sda-1);
+		m -= tmp;
+		}
+	for( ; ii<m-(bs-1); ii+=bs)
+		{
+		for(i=0; i<bs; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				fprintf(file, "%9.5f ", pA[i+bs*j+sda*ii]);
+				}
+			fprintf(file, "\n");
+			}
+		}
+	if(ii<m)
+		{
+		tmp = m-ii;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				fprintf(file, "%9.5f ", pA[i+bs*j+sda*ii]);
+				}
+			fprintf(file, "\n");
+			}
+		}
+	fprintf(file, "\n");
+	return;
+	}
+
+
+
+// print a vector structure
+void d_print_to_file_strvec(FILE * file, int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_to_file_mat(file, m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure to a file
+void d_print_tran_to_file_strvec(FILE * file, int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_to_file_mat(file, 1, m, pa, 1);
+	return;
+	}
+
+
+
+// print a matrix structure
+void d_print_e_strmat(int m, int n, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = D_PS;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int ii, i, j, tmp;
+	ii = 0;
+	if(ai%bs>0)
+		{
+		tmp = bs-ai%bs;
+		tmp = m<tmp ? m : tmp;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%e\t", pA[i+bs*j]);
+				}
+			printf("\n");
+			}
+		pA += tmp + bs*(sda-1);
+		m -= tmp;
+		}
+	for( ; ii<m-(bs-1); ii+=bs)
+		{
+		for(i=0; i<bs; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%e\t", pA[i+bs*j+sda*ii]);
+				}
+			printf("\n");
+			}
+		}
+	if(ii<m)
+		{
+		tmp = m-ii;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%e\t", pA[i+bs*j+sda*ii]);
+				}
+			printf("\n");
+			}
+		}
+	printf("\n");
+	return;
+	}
+
+
+
+// print a vector structure
+void d_print_e_strvec(int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_e_mat(m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void d_print_e_tran_strvec(int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_e_mat(1, m, pa, 1);
+	return;
+	}
+
+
+
+#elif defined(LA_BLAS) | defined(LA_REFERENCE)
+
+
+
+// create a matrix structure for a matrix of size m*n
+void d_allocate_strmat(int m, int n, struct d_strmat *sA)
+	{
+	sA->m = m;
+	sA->n = n;
+	d_zeros(&(sA->pA), sA->m, sA->n);
+	int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+	d_zeros(&(sA->dA), tmp, 1);
+	sA->memory_size = (m*n+tmp)*sizeof(double);
+	return;
+	}
+
+
+
+// free memory of a matrix structure
+void d_free_strmat(struct d_strmat *sA)
+	{
+	free(sA->pA);
+	free(sA->dA);
+	return;
+	}
+
+
+
+// create a vector structure for a vector of size m
+void d_allocate_strvec(int m, struct d_strvec *sa)
+	{
+	sa->m = m;
+	d_zeros(&(sa->pa), sa->m, 1);
+	sa->memory_size = m*sizeof(double);
+	return;
+	}
+
+
+
+// free memory of a vector structure
+void d_free_strvec(struct d_strvec *sa)
+	{
+	free(sa->pa);
+	return;
+	}
+
+
+
+// print a matrix structure
+void d_print_strmat(int m, int n, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	d_print_mat(m, n, pA, lda);
+	return;
+	}
+
+
+
+// print a vector structure
+void d_print_strvec(int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_mat(m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void d_print_tran_strvec(int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_mat(1, m, pa, 1);
+	return;
+	}
+
+
+
+// print a matrix structure
+void d_print_to_file_strmat(FILE *file, int m, int n, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	d_print_to_file_mat(file, m, n, pA, lda);
+	return;
+	}
+
+
+
+// print a vector structure
+void d_print_to_file_strvec(FILE *file, int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_to_file_mat(file, m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure to a file
+void d_print_to_file_tran_strvec(FILE *file, int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_to_file_mat(file, 1, m, pa, 1);
+	return;
+	}
+
+
+
+// print a matrix structure
+void d_print_e_strmat(int m, int n, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	d_print_e_mat(m, n, pA, lda);
+	return;
+	}
+
+
+
+// print a vector structure
+void d_print_e_strvec(int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_e_mat(m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void d_print_e_tran_strvec(int m, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	d_print_e_mat(1, m, pa, 1);
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/auxiliary/d_aux_lib.c b/auxiliary/d_aux_lib.c
new file mode 100644
index 0000000..6f1f5d1
--- /dev/null
+++ b/auxiliary/d_aux_lib.c
@@ -0,0 +1,982 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_REFERENCE) | defined(LA_BLAS)
+
+
+
+// return memory size (in bytes) needed for a strmat
+int d_size_strmat(int m, int n)
+	{
+	int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+	int size = (m*n+tmp)*sizeof(double);
+	return size;
+	}
+
+
+
+// return memory size (in bytes) needed for the diagonal of a strmat
+int d_size_diag_strmat(int m, int n)
+	{
+	int size = 0;
+	int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+	size = tmp*sizeof(double);
+	return size;
+	}
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void d_create_strmat(int m, int n, struct d_strmat *sA, void *memory)
+	{
+	sA->m = m;
+	sA->n = n;
+	double *ptr = (double *) memory;
+	sA->pA = ptr;
+	ptr += m*n;
+	int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+	sA->dA = ptr;
+	ptr += tmp;
+	sA->use_dA = 0;
+	sA->memory_size = (m*n+tmp)*sizeof(double);
+	return;
+	}
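+
+
+
+// Illustrative usage sketch (not part of the upstream BLASFEO sources): the
+// d_size_strmat() / d_create_strmat() pair lets the caller own the memory; the strmat
+// only wraps it, so it is released with a plain free() and not with d_free_strmat().
+// The BLASFEO_EXAMPLES guard is hypothetical and is never defined by the build system.
+#ifdef BLASFEO_EXAMPLES
+static void example_create_strmat(void)
+	{
+	int m = 6, n = 4;
+	void *mem = malloc(d_size_strmat(m, n)); // error checking omitted
+	struct d_strmat sA;
+	d_create_strmat(m, n, &sA, mem);
+	// ... use sA ...
+	free(mem);
+	}
+#endif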
+
+
+
+// return memory size (in bytes) needed for a strvec
+int d_size_strvec(int m)
+	{
+	int size = m*sizeof(double);
+	return size;
+	}
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void d_create_strvec(int m, struct d_strvec *sa, void *memory)
+	{
+	sa->m = m;
+	double *ptr = (double *) memory;
+	sa->pa = ptr;
+//	ptr += m * n;
+	sa->memory_size = m*sizeof(double);
+	return;
+	}
+
+
+
+// convert a matrix into a matrix structure
+void d_cvt_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj)
+	{
+	int ii, jj;
+	int lda2 = sA->m;
+	double *pA = sA->pA + ai + aj*lda2;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pA[ii+0+jj*lda2] = A[ii+0+jj*lda];
+			pA[ii+1+jj*lda2] = A[ii+1+jj*lda];
+			pA[ii+2+jj*lda2] = A[ii+2+jj*lda];
+			pA[ii+3+jj*lda2] = A[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pA[ii+0+jj*lda2] = A[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// convert and transpose a matrix into a matrix structure
+void d_cvt_tran_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj)
+	{
+	int ii, jj;
+	int lda2 = sA->m;
+	double *pA = sA->pA + ai + aj*lda2;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pA[jj+(ii+0)*lda2] = A[ii+0+jj*lda];
+			pA[jj+(ii+1)*lda2] = A[ii+1+jj*lda];
+			pA[jj+(ii+2)*lda2] = A[ii+2+jj*lda];
+			pA[jj+(ii+3)*lda2] = A[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pA[jj+(ii+0)*lda2] = A[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector into a vector structure
+void d_cvt_vec2strvec(int m, double *a, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		pa[ii] = a[ii];
+	return;
+	}
+
+
+
+// convert a matrix structure into a matrix
+void d_cvt_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda)
+	{
+	int ii, jj;
+	int lda2 = sA->m;
+	double *pA = sA->pA + ai + aj*lda2;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			A[ii+0+jj*lda] = pA[ii+0+jj*lda2];
+			A[ii+1+jj*lda] = pA[ii+1+jj*lda2];
+			A[ii+2+jj*lda] = pA[ii+2+jj*lda2];
+			A[ii+3+jj*lda] = pA[ii+3+jj*lda2];
+			}
+		for(; ii<m; ii++)
+			{
+			A[ii+0+jj*lda] = pA[ii+0+jj*lda2];
+			}
+		}
+	return;
+	}
+
+
+
+// convert and transpose a matrix structure into a matrix
+void d_cvt_tran_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda)
+	{
+	int ii, jj;
+	int lda2 = sA->m;
+	double *pA = sA->pA + ai + aj*lda2;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			A[jj+(ii+0)*lda] = pA[ii+0+jj*lda2];
+			A[jj+(ii+1)*lda] = pA[ii+1+jj*lda2];
+			A[jj+(ii+2)*lda] = pA[ii+2+jj*lda2];
+			A[jj+(ii+3)*lda] = pA[ii+3+jj*lda2];
+			}
+		for(; ii<m; ii++)
+			{
+			A[jj+(ii+0)*lda] = pA[ii+0+jj*lda2];
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector structure into a vector
+void d_cvt_strvec2vec(int m, struct d_strvec *sa, int ai, double *a)
+	{
+	double *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		a[ii] = pa[ii];
+	return;
+	}
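+
+
+
+// Illustrative usage sketch (not part of the upstream BLASFEO sources): round trip of a
+// 3x2 column-major array through a strmat and back. In this LA_REFERENCE / LA_BLAS
+// storage the strmat itself is column-major with leading dimension sA->m.
+// The BLASFEO_EXAMPLES guard is hypothetical and is never defined by the build system.
+#ifdef BLASFEO_EXAMPLES
+static void example_cvt_round_trip(void)
+	{
+	double A[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; // 3x2, column-major, lda=3
+	double B[6];
+	void *mem = malloc(d_size_strmat(3, 2)); // error checking omitted
+	struct d_strmat sA;
+	d_create_strmat(3, 2, &sA, mem);
+	d_cvt_mat2strmat(3, 2, A, 3, &sA, 0, 0);
+	d_cvt_strmat2mat(3, 2, &sA, 0, 0, B, 3); // B now holds the same 3x2 matrix as A
+	free(mem);
+	}
+#endif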
+
+
+
+// cast a matrix into a matrix structure
+void d_cast_mat2strmat(double *A, struct d_strmat *sA)
+	{
+	sA->pA = A;
+	return;
+	}
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void d_cast_diag_mat2strmat(double *dA, struct d_strmat *sA)
+	{
+	sA->dA = dA;
+	return;
+	}
+
+
+
+// cast a vector into a vector structure
+void d_cast_vec2vecmat(double *a, struct d_strvec *sa)
+	{
+	sa->pa = a;
+	return;
+	}
+
+
+
+// insert element into strmat
+void dgein1_libstr(double a, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	pA[0] = a;
+	return;
+	}
+
+
+
+// extract element from strmat
+double dgeex1_libstr(struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	return pA[0];
+	}
+
+
+
+// insert element into strvec
+void dvecin1_libstr(double a, struct d_strvec *sx, int xi)
+	{
+	double *x = sx->pa + xi;
+	x[0] = a;
+	return;
+	}
+
+
+
+// extract element from strvec
+double dvecex1_libstr(struct d_strvec *sx, int xi)
+	{
+	double *x = sx->pa + xi;
+	return x[0];
+	}
+
+
+
+// set all elements of a strmat to a value
+void dgese_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			pA[ii+lda*jj] = alpha;
+			}
+		}
+	return;
+	}
+
+
+
+// set all elements of a strvec to a value
+void dvecse_libstr(int m, double alpha, struct d_strvec *sx, int xi)
+	{
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		x[ii] = alpha;
+	return;
+	}
+
+
+
+// insert a vector into diagonal
+void ddiain_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*(lda+1)] = alpha*x[ii];
+	return;
+	}
+
+
+
+// add scalar to diagonal
+void ddiare_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*(lda+1)] += alpha;
+	return;
+	}
+
+
+
+// extract a row into a vector
+void drowex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		x[ii] = alpha*pA[ii*lda];
+	return;
+	}
+
+
+
+// insert a vector into a row
+void drowin_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*lda] = alpha*x[ii];
+	return;
+	}
+
+
+
+// add a vector to a row
+void drowad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*lda] += alpha*x[ii];
+	return;
+	}
+
+
+
+// swap two rows of a matrix struct
+void drowsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+	int ii;
+	double tmp;
+	for(ii=0; ii<kmax; ii++)
+		{
+		tmp = pA[ii*lda];
+		pA[ii*lda] = pC[ii*ldc];
+		pC[ii*ldc] = tmp;
+		}
+	return;
+	}
+
+
+
+// permute the rows of a matrix struct
+void drowpe_libstr(int kmax, int *ipiv, struct d_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			drowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+		}
+	return;
+	}
+
+
+
+// extract vector from column
+void dcolex_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		x[ii] = pA[ii];
+	return;
+	}
+
+
+
+// insert a vector into a column
+void dcolin_libstr(int kmax, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii] = x[ii];
+	return;
+	}
+
+
+
+// swap two cols of a matrix struct
+void dcolsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+	int ii;
+	double tmp;
+	for(ii=0; ii<kmax; ii++)
+		{
+		tmp = pA[ii];
+		pA[ii] = pC[ii];
+		pC[ii] = tmp;
+		}
+	return;
+	}
+
+
+
+// permute the cols of a matrix struct
+void dcolpe_libstr(int kmax, int *ipiv, struct d_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			dcolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+		}
+	return;
+	}
+
+
+
+// copy a generic strmat into a generic strmat
+void dgecp_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+			pC[ii+1+jj*ldc] = pA[ii+1+jj*lda];
+			pC[ii+2+jj*ldc] = pA[ii+2+jj*lda];
+			pC[ii+3+jj*ldc] = pA[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// scale a generic strmat
+void dgesc_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pA[ii+0+jj*lda] *= alpha;
+			pA[ii+1+jj*lda] *= alpha;
+			pA[ii+2+jj*lda] *= alpha;
+			pA[ii+3+jj*lda] *= alpha;
+			}
+		for(; ii<m; ii++)
+			{
+			pA[ii+0+jj*lda] *= alpha;
+			}
+		}
+	return;
+	}
+
+
+
+// copy a strvec into a strvec
+void dveccp_libstr(int m, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci)
+	{
+	double *pa = sa->pa + ai;
+	double *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] = pa[ii+0];
+		pc[ii+1] = pa[ii+1];
+		pc[ii+2] = pa[ii+2];
+		pc[ii+3] = pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] = pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// scale a strvec
+void dvecsc_libstr(int m, double alpha, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pa[ii+0] *= alpha;
+		pa[ii+1] *= alpha;
+		pa[ii+2] *= alpha;
+		pa[ii+3] *= alpha;
+		}
+	for(; ii<m; ii++)
+		{
+		pa[ii+0] *= alpha;
+		}
+	return;
+	}
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void dtrcp_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<m; jj++)
+		{
+		ii = jj;
+		for(; ii<m; ii++)
+			{
+			pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// scale and add a generic strmat into a generic strmat
+void dgead_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pC[ii+0+jj*ldc] += alpha*pA[ii+0+jj*lda];
+			pC[ii+1+jj*ldc] += alpha*pA[ii+1+jj*lda];
+			pC[ii+2+jj*ldc] += alpha*pA[ii+2+jj*lda];
+			pC[ii+3+jj*ldc] += alpha*pA[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pC[ii+0+jj*ldc] += alpha*pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// scales and adds a strvec into a strvec
+void dvecad_libstr(int m, double alpha, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci)
+	{
+	double *pa = sa->pa + ai;
+	double *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		pc[ii+1] += alpha*pa[ii+1];
+		pc[ii+2] += alpha*pa[ii+2];
+		pc[ii+3] += alpha*pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void dgetr_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+			pC[jj+(ii+1)*ldc] = pA[ii+1+jj*lda];
+			pC[jj+(ii+2)*ldc] = pA[ii+2+jj*lda];
+			pC[jj+(ii+3)*ldc] = pA[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void dtrtr_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<m; jj++)
+		{
+		ii = jj;
+		for(; ii<m; ii++)
+			{
+			pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void dtrtr_u_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	double *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<m; jj++)
+		{
+		ii = 0;
+		for(; ii<=jj; ii++)
+			{
+			pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// insert a strvec to the diagonal of a strmat, sparse formulation
+void ddiain_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+	{
+	double *x = sx->pa + xi;
+	int ldd = sD->m;
+	double *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*(ldd+1)] = alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// extract a vector from diagonal
+void ddiaex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		x[ii] = alpha*pA[ii*(lda+1)];
+	return;
+	}
+
+
+
+// extract the diagonal of a strmat into a strvec, sparse formulation
+void ddiaex_sp_libstr(int kmax, double alpha, int *idx, struct d_strmat *sD, int di, int dj, struct d_strvec *sx, int xi)
+	{
+	double *x = sx->pa + xi;
+	int ldd = sD->m;
+	double *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		x[jj] = alpha * pD[ii*(ldd+1)];
+		}
+	return;
+	}
+
+
+
+// add a vector to diagonal
+void ddiaad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	double *pA = sA->pA + ai + aj*lda;
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*(lda+1)] += alpha*x[ii];
+	return;
+	}
+
+
+
+// add scaled strvec to the diagonal of a strmat, sparse formulation
+void ddiaad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+	{
+	double *x = sx->pa + xi;
+	int ldd = sD->m;
+	double *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*(ldd+1)] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
+void ddiaadin_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, int *idx, struct d_strmat *sD, int di, int dj)
+	{
+	double *x = sx->pa + xi;
+	double *y = sy->pa + yi;
+	int ldd = sD->m;
+	double *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*(ldd+1)] = y[jj] + alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to row of strmat, sparse formulation
+void drowad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+	{
+	double *x = sx->pa + xi;
+	int ldd = sD->m;
+	double *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*ldd] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+
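+// add scaled strvec to strvec, sparse formulation (destination entries selected by idx)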
+void dvecad_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi)
+	{
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[idx[ii]] += alpha * x[ii];
+	return;
+	}
+
+
+
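+// insert scaled strvec into strvec, sparse formulation (destination entries selected by idx)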
+void dvecin_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi)
+	{
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[idx[ii]] = alpha * x[ii];
+	return;
+	}
+
+
+
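+// extract scaled strvec entries, sparse formulation (source entries selected by idx)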
+void dvecex_sp_libstr(int m, double alpha, int *idx, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+	{
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[ii] = alpha * x[idx[ii]];
+	return;
+	}
+
+
+// clip without mask return
+void dveccl_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi)
+	{
+	double *xm = sxm->pa + xim;
+	double *x  = sx->pa + xi;
+	double *xp = sxp->pa + xip;
+	double *z  = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		if(x[ii]>=xp[ii])
+			{
+			z[ii] = xp[ii];
+			}
+		else if(x[ii]<=xm[ii])
+			{
+			z[ii] = xm[ii];
+			}
+		else
+			{
+			z[ii] = x[ii];
+			}
+		}
+	return;
+	}
+
+
+
+// clip with mask return
+void dveccl_mask_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi, struct d_strvec *sm, int mi)
+	{
+	double *xm = sxm->pa + xim;
+	double *x  = sx->pa + xi;
+	double *xp = sxp->pa + xip;
+	double *z  = sz->pa + zi;
+	double *mask  = sm->pa + mi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		if(x[ii]>=xp[ii])
+			{
+			z[ii] = xp[ii];
+			mask[ii] = 1.0;
+			}
+		else if(x[ii]<=xm[ii])
+			{
+			z[ii] = xm[ii];
+			mask[ii] = -1.0;
+			}
+		else
+			{
+			z[ii] = x[ii];
+			mask[ii] = 0.0;
+			}
+		}
+	return;
+	}
+
+
+// zero out components using mask
+void dvecze_libstr(int m, struct d_strvec *sm, int mi, struct d_strvec *sv, int vi, struct d_strvec *se, int ei)
+	{
+	double *mask = sm->pa + mi;
+	double *v = sv->pa + vi;
+	double *e = se->pa + ei;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		if(mask[ii]==0)
+			{
+			e[ii] = v[ii];
+			}
+		else
+			{
+			e[ii] = 0;
+			}
+		}
+	return;
+	}
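+
+
+
+// Illustrative usage sketch (not part of the upstream BLASFEO sources): clip x into
+// [xm, xp] while recording in the mask which bound was active, then use the mask to zero
+// the corresponding components of another vector. d_cast_vec2vecmat() only sets the data
+// pointer, which is all these routines use. The BLASFEO_EXAMPLES guard is hypothetical
+// and is never defined by the build system.
+#ifdef BLASFEO_EXAMPLES
+static void example_clip_and_zero(void)
+	{
+	double xm[4] = {-1.0, -1.0, -1.0, -1.0};
+	double xp[4] = { 1.0,  1.0,  1.0,  1.0};
+	double x[4]  = {-2.0,  0.5,  3.0,  0.0};
+	double v[4]  = {10.0, 20.0, 30.0, 40.0};
+	double z[4], mask[4], e[4];
+	struct d_strvec sxm, sxp, sx, sz, sm, sv, se;
+	d_cast_vec2vecmat(xm, &sxm);
+	d_cast_vec2vecmat(xp, &sxp);
+	d_cast_vec2vecmat(x, &sx);
+	d_cast_vec2vecmat(z, &sz);
+	d_cast_vec2vecmat(mask, &sm);
+	d_cast_vec2vecmat(v, &sv);
+	d_cast_vec2vecmat(e, &se);
+	dveccl_mask_libstr(4, &sxm, 0, &sx, 0, &sxp, 0, &sz, 0, &sm, 0);
+	// z = {-1.0, 0.5, 1.0, 0.0}, mask = {-1.0, 0.0, 1.0, 0.0}
+	dvecze_libstr(4, &sm, 0, &sv, 0, &se, 0);
+	// e = {0.0, 20.0, 0.0, 40.0}
+	}
+#endif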
+
+
+
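+// compute the infinity norm of a strvec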
+void dvecnrm_inf_libstr(int m, struct d_strvec *sx, int xi, double *ptr_norm)
+	{
+	int ii;
+	double *x = sx->pa + xi;
+	double norm = 0.0;
+	for(ii=0; ii<m; ii++)
+		norm = fmax(norm, fabs(x[ii]));
+	*ptr_norm = norm;
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/auxiliary/d_aux_lib4.c b/auxiliary/d_aux_lib4.c
new file mode 100644
index 0000000..152aed1
--- /dev/null
+++ b/auxiliary/d_aux_lib4.c
@@ -0,0 +1,3609 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_block_size.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+// copies a packed matrix into a packed matrix
+// TODO remove alpha !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+void dgecp_lib(int m, int n, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int mna, ii;
+
+	int offA = offsetA%bs;
+	int offB = offsetB%bs;
+
+	// A at the beginning of the block
+	A -= offA;
+
+	// B at the beginning of the block
+	B -= offB;
+
+	// same alignment
+	if(offA==offB)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgecp_3_0_lib4(0, n, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgecp_8_0_lib4(0, n, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgecp_4_0_lib4(0, n, alpha, A, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgecp_1_0_lib4(0, n, alpha, A, B);
+			else if(m-ii==2)
+				kernel_dgecp_2_0_lib4(0, n, alpha, A, B);
+			else // if(m-ii==3)
+				kernel_dgecp_3_0_lib4(0, n, alpha, A, B);
+			}
+		}
+	// skip one element of A
+	else if(offA==(offB+1)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+				//A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgecp_2_3_lib4(0, n, alpha, A, sda, B+2);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgecp_3_2_lib4(0, n, alpha, A, sda, B+1);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_dgecp_8_1_lib4(0, n, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_dgecp_4_1_lib4(0, n, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgecp_1_0_lib4(0, n, alpha, A+1, B);
+			else if(m-ii==2)
+				kernel_dgecp_2_0_lib4(0, n, alpha, A+1, B);
+			else // if(m-ii==3)
+				kernel_dgecp_3_0_lib4(0, n, alpha, A+1, B);
+			}
+		}
+	// skip 2 elements of A
+	else if(offA==(offB+2)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_dgecp_2_3_lib4(0, n, alpha, A, sda, B+1);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgecp_1_0_lib4(0, n, alpha, A+1, B+3);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgecp_2_0_lib4(0, n, alpha, A, B+2);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgecp_3_3_lib4(0, n, alpha, A, sda, B+1);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgecp_8_2_lib4(0, n, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgecp_4_2_lib4(0, n, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgecp_1_0_lib4(0, n, alpha, A+2, B);
+			else if(m-ii==2)
+				kernel_dgecp_2_0_lib4(0, n, alpha, A+2, B);
+			else // if(m-ii==3)
+				kernel_dgecp_3_2_lib4(0, n, alpha, A, sda, B);
+			}
+		}
+	// skip 3 elements of A
+	else // if(offA==(offB+3)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgecp_1_0_lib4(0, n, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgecp_2_0_lib4(0, n, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgecp_3_0_lib4(0, n, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgecp_8_3_lib4(0, n, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgecp_4_3_lib4(0, n, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgecp_1_0_lib4(0, n, alpha, A+3, B);
+			else if(m-ii==2)
+				kernel_dgecp_2_3_lib4(0, n, alpha, A, sda, B);
+			else // if(m-ii==3)
+				kernel_dgecp_3_3_lib4(0, n, alpha, A, sda, B);
+			}
+		}
+
+	}
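+
+// Minimal usage sketch (illustrative only; sA and sB are hypothetical, already
+// created d_strmat objects of sufficient size): copy the top-left m x n block
+// of sA into sB, scaled by alpha, working directly on the panel-major buffers
+// with both offsets equal to 0:
+//
+//	dgecp_lib(m, n, alpha, 0, sA.pA, sA.cn, 0, sB.pA, sB.cn);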
+
+
+
+// copies a lower triangular packed matrix into a lower triangular packed matrix
+void dtrcp_l_lib(int m, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb)
+	{
+
+	if(m<=0)
+		return;
+
+	int n = m;
+
+	const int bs = 4;
+
+	int mna, ii;
+
+	int offA = offsetA%bs;
+	int offB = offsetB%bs;
+
+	// A at the beginning of the block
+	A -= offA;
+
+	// B at the beginning of the block
+	B -= offB;
+
+	// same alignment
+	if(offA==offB)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgecp_3_0_lib4(1, ii, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgecp_8_0_lib4(1, ii, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgecp_4_0_lib4(1, ii, alpha, A, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgecp_1_0_lib4(1, ii, alpha, A, B);
+			else if(m-ii==2)
+				kernel_dgecp_2_0_lib4(1, ii, alpha, A, B);
+			else // if(m-ii==3)
+				kernel_dgecp_3_0_lib4(1, ii, alpha, A, B);
+			}
+		}
+	// skip one element of A
+	else if(offA==(offB+1)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+				//A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgecp_2_3_lib4(1, ii, alpha, A, sda, B+2);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgecp_3_2_lib4(1, ii, alpha, A, sda, B+1);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_dgecp_8_1_lib4(1, ii, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_dgecp_4_1_lib4(1, ii, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgecp_1_0_lib4(1, ii, alpha, A+1, B);
+			else if(m-ii==2)
+				kernel_dgecp_2_0_lib4(1, ii, alpha, A+1, B);
+			else // if(m-ii==3)
+				kernel_dgecp_3_0_lib4(1, ii, alpha, A+1, B);
+			}
+		}
+	// skip 2 elements of A
+	else if(offA==(offB+2)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_dgecp_2_3_lib4(1, ii, alpha, A, sda, B+1);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgecp_1_0_lib4(1, ii, alpha, A+1, B+3);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgecp_2_0_lib4(1, ii, alpha, A, B+2);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgecp_3_3_lib4(1, ii, alpha, A, sda, B+1);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgecp_8_2_lib4(1, ii, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgecp_4_2_lib4(1, ii, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgecp_1_0_lib4(1, ii, alpha, A+2, B);
+			else if(m-ii==2)
+				kernel_dgecp_2_0_lib4(1, ii, alpha, A+2, B);
+			else // if(m-ii==3)
+				kernel_dgecp_3_2_lib4(1, ii, alpha, A, sda, B);
+			}
+		}
+	// skip 3 elements of A
+	else // if(offA==(offB+3)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgecp_1_0_lib4(1, ii, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgecp_2_0_lib4(1, ii, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgecp_3_0_lib4(1, ii, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgecp_8_3_lib4(1, ii, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgecp_4_3_lib4(1, ii, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgecp_1_0_lib4(1, ii, alpha, A+3, B);
+			else if(m-ii==2)
+				kernel_dgecp_2_3_lib4(1, ii, alpha, A, sda, B);
+			else // if(m-ii==3)
+				kernel_dgecp_3_3_lib4(1, ii, alpha, A, sda, B);
+			}
+		}
+
+	}
+
+
+
+// scales and adds a packed matrix into a packed matrix: B = B + alpha*A
+void dgead_lib(int m, int n, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int mna, ii;
+
+	int offA = offsetA%bs;
+	int offB = offsetB%bs;
+
+	// A at the beginning of the block
+	A -= offA;
+
+	// B at the beginning of the block
+	B -= offB;
+
+	// same alignment
+	if(offA==offB)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgead_3_0_lib4(n, alpha, A+offA, B+offB);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgead_8_0_lib4(n, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgead_4_0_lib4(n, alpha, A, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgead_1_0_lib4(n, alpha, A, B);
+			else if(m-ii==2)
+				kernel_dgead_2_0_lib4(n, alpha, A, B);
+			else // if(m-ii==3)
+				kernel_dgead_3_0_lib4(n, alpha, A, B);
+			}
+		}
+	// skip one element of A
+	else if(offA==(offB+1)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+				//A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgead_2_3_lib4(n, alpha, A, sda, B+2);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgead_3_2_lib4(n, alpha, A, sda, B+1);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_dgead_8_1_lib4(n, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_dgead_4_1_lib4(n, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgead_1_0_lib4(n, alpha, A+1, B);
+			else if(m-ii==2)
+				kernel_dgead_2_0_lib4(n, alpha, A+1, B);
+			else // if(m-ii==3)
+				kernel_dgead_3_0_lib4(n, alpha, A+1, B);
+			}
+		}
+	// skip 2 elements of A
+	else if(offA==(offB+2)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_dgead_2_3_lib4(n, alpha, A, sda, B+1);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgead_1_0_lib4(n, alpha, A+1, B+3);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgead_2_0_lib4(n, alpha, A, B+2);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgead_3_3_lib4(n, alpha, A, sda, B+1);
+				A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgead_8_2_lib4(n, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgead_4_2_lib4(n, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgead_1_0_lib4(n, alpha, A+2, B);
+			else if(m-ii==2)
+				kernel_dgead_2_0_lib4(n, alpha, A+2, B);
+			else // if(m-ii==3)
+				kernel_dgead_3_2_lib4(n, alpha, A, sda, B);
+			}
+		}
+	// skip 3 elements of A
+	else // if(offA==(offB+3)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_dgead_1_0_lib4(n, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_dgead_2_0_lib4(n, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_dgead_3_0_lib4(n, alpha, A+offA, B+offB);
+				// A += 4*sda;
+				B += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-7; ii+=8)
+			{
+			kernel_dgead_8_3_lib4(n, alpha, A, sda, B, sdb);
+			A += 8*sda;
+			B += 8*sdb;
+			}
+#endif
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_dgead_4_3_lib4(n, alpha, A, sda, B);
+			A += 4*sda;
+			B += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_dgead_1_0_lib4(n, alpha, A+3, B);
+			else if(m-ii==2)
+				kernel_dgead_2_3_lib4(n, alpha, A, sda, B);
+			else // if(m-ii==3)
+				kernel_dgead_3_3_lib4(n, alpha, A, sda, B);
+			}
+		}
+
+	}
+
+
+
+// scales and adds a strvec into a strvec
+void dvecad_libstr(int m, double alpha, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci)
+	{
+	double *pa = sa->pa + ai;
+	double *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		pc[ii+1] += alpha*pa[ii+1];
+		pc[ii+2] += alpha*pa[ii+2];
+		pc[ii+3] += alpha*pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		}
+	return;
+	}
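+
+// Minimal usage sketch (hypothetical strvec objects sa and sc, both already
+// created and of length at least m): axpy-style update c[0:m] += 0.5 * a[0:m]:
+//
+//	dvecad_libstr(m, 0.5, &sa, 0, &sc, 0);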
+
+
+
+// transpose a general matrix; m and n refer to the original (non-transposed) matrix
+void dgetr_lib(int m, int n, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
+	{
+
+/*
+
+m = 5
+n = 3
+offsetA = 1
+offsetC = 2
+
+A =
+ x x x
+ -
+ x x x
+ x x x
+ x x x
+ x x x
+
+C =
+ x x x x x
+ x x x x x
+ -
+ x x x x x
+
+*/
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int mna = (bs-offsetA%bs)%bs;
+	mna = m<mna ? m : mna;
+	int nna = (bs-offsetC%bs)%bs;
+	nna = n<nna ? n : nna;
+
+	int ii;
+
+	ii = 0;
+
+	if(mna>0)
+		{
+		if(mna==1)
+			kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+		else if(mna==2)
+			kernel_dgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
+		else //if(mna==3)
+			kernel_dgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
+		ii += mna;
+		pA += mna + bs*(sda-1);
+		pC += mna*bs;
+		}
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for( ; ii<m-7; ii+=8)
+		{
+		kernel_dgetr_8_lib4(0, n, nna, alpha, pA, sda, pC, sdc);
+		pA += 2*bs*sda;
+		pC += 2*bs*bs;
+		}
+#endif
+	for( ; ii<m-3; ii+=4)
+//	for( ; ii<m; ii+=4)
+		{
+		kernel_dgetr_4_lib4(0, n, nna, alpha, pA, pC, sdc);
+		pA += bs*sda;
+		pC += bs*bs;
+		}
+
+	// clean-up at the end using smaller kernels
+	if(ii==m)
+		return;
+
+	if(m-ii==1)
+		kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+	else if(m-ii==2)
+		kernel_dgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
+	else if(m-ii==3)
+		kernel_dgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
+
+	return;
+
+	}
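+
+// Minimal usage sketch (illustrative only; sA is an m x n and sC an n x m
+// hypothetical d_strmat, both already created): write alpha * A^T into C,
+// working directly on the panel-major buffers with both offsets equal to 0:
+//
+//	dgetr_lib(m, n, alpha, 0, sA.pA, sA.cn, 0, sC.pA, sC.cn);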
+
+
+
+// transpose lower triangular matrix
+void dtrtr_l_lib(int m, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
+	{
+
+/*
+
+A =
+ x
+ x x
+ x x x
+ x x x x
+
+ x x x x x
+ x x x x x x
+ x x x x x x x
+ x x x x x x x x
+
+C =
+ x x x x x x x x
+
+   x x x x x x x
+     x x x x x x
+	   x x x x x
+	     x x x x
+
+	       x x x
+	         x x
+	           x
+
+*/
+
+	int n = m;
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int mna = (bs-offsetA%bs)%bs;
+	mna = m<mna ? m : mna;
+	int nna = (bs-offsetC%bs)%bs;
+	nna = n<nna ? n : nna;
+
+	int ii;
+
+	ii = 0;
+
+	if(mna>0)
+		{
+		if(mna==1)
+			{
+			pC[0] = alpha * pA[0];
+			}
+		else if(mna==2)
+			{
+			if(nna==1)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
+				}
+			else
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				}
+			}
+		else //if(mna==3)
+			{
+			if(nna==1)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[0+bs*2] = alpha * pA[2+bs*0];
+				pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
+				pC[1+bs*(1+sdc)] = alpha * pA[2+bs*1];
+				pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
+				}
+			else if(nna==2)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[0+bs*2] = alpha * pA[2+bs*0];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[1+bs*2] = alpha * pA[2+bs*1];
+				pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
+				}
+			else
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[0+bs*2] = alpha * pA[2+bs*0];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[1+bs*2] = alpha * pA[2+bs*1];
+				pC[2+bs*2] = alpha * pA[2+bs*2];
+				}
+			}
+		ii += mna;
+		pA += mna + bs*(sda-1);
+		pC += mna*bs;
+		}
+#if 0 //defined(TARGET_X64_INTEL_HASWELL)
+	for( ; ii<m-7; ii+=8)
+		{
+		kernel_dgetr_8_lib4(1, n, nna, alpha, pA, sda, pC, sdc);
+		pA += 2*bs*sda;
+		pC += 2*bs*bs;
+		}
+#endif
+	for( ; ii<m-3; ii+=4)
+		{
+		kernel_dgetr_4_lib4(1, ii, nna, alpha, pA, pC, sdc);
+		pA += bs*sda;
+		pC += bs*bs;
+		}
+
+	// clean-up at the end using smaller kernels
+	if(ii==m)
+		return;
+
+	if(m-ii==1)
+		kernel_dgetr_1_lib4(1, ii, nna, alpha, pA, pC, sdc);
+	else if(m-ii==2)
+		kernel_dgetr_2_lib4(1, ii, nna, alpha, pA, pC, sdc);
+	else if(m-ii==3)
+		kernel_dgetr_3_lib4(1, ii, nna, alpha, pA, pC, sdc);
+
+	return;
+
+	}
+
+
+
+// transpose an aligned upper triangular matrix into an aligned lower triangular matrix
+void dtrtr_u_lib(int m, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
+	{
+
+/*
+
+A =
+ x x x x x x x x
+   x x x x x x x
+
+     x x x x x x
+       x x x x x
+         x x x x
+           x x x
+             x x
+               x
+
+C =
+ x
+
+ x x
+ x x x
+ x x x x
+ x x x x x
+ x x x x x x
+ x x x x x x x
+ x x x x x x x x
+
+*/
+
+	int n = m;
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int mna = (bs-offsetA%bs)%bs;
+	mna = m<mna ? m : mna;
+	int nna = (bs-offsetC%bs)%bs;
+	nna = n<nna ? n : nna;
+	int tna = nna;
+
+	int ii;
+
+	ii = 0;
+
+	if(mna>0)
+		{
+		if(mna==1)
+			{
+			kernel_dgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+			if(nna!=1)
+				{
+//				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pA += 1*bs;
+				pC += 1;
+				tna = (bs-(offsetC+1)%bs)%bs;
+				}
+			else //if(nna==1)
+				{
+//				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pA += 1*bs;
+				pC += 1 + (sdc-1)*bs;
+				tna = 0; //(bs-(offsetC+1)%bs)%bs;
+				}
+//			kernel_dgetr_1_lib4(0, n-1, tna, alpha, pA, pC, sdc);
+			}
+		else if(mna==2)
+			{
+			if(nna==0 || nna==3)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pA += 2*bs;
+				pC += 2;
+				tna = (bs-(offsetC+2)%bs)%bs;
+				kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+				}
+			else if(nna==1)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pA += 1*bs;
+				pC += 1 + (sdc-1)*bs;
+//				pC[0+bs*0] = alpha * pA[0+bs*0];
+//				pC[0+bs*1] = alpha * pA[1+bs*0];
+				kernel_dgetr_2_lib4(0, n-1, 0, alpha, pA, pC, sdc);
+				pA += 1*bs;
+				pC += 1;
+				tna = 3; //(bs-(offsetC+2)%bs)%bs;
+//				kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+				}
+			else if(nna==2)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pA += 2*bs;
+				pC += 2 + (sdc-1)*bs;
+				tna = 0; //(bs-(offsetC+2)%bs)%bs;
+				kernel_dgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+				}
+			}
+		else //if(mna==3)
+			{
+			if(nna==0)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[2+bs*0] = alpha * pA[0+bs*2];
+				pC[2+bs*1] = alpha * pA[1+bs*2];
+				pC[2+bs*2] = alpha * pA[2+bs*2];
+				pA += 3*bs;
+				pC += 3;
+				tna = 1;
+				kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+				}
+			else if(nna==1)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pA += bs;
+				pC += 1 + (sdc-1)*bs;
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[1+bs*2] = alpha * pA[2+bs*1];
+				pA += 2*bs;
+				pC += 2;
+				tna = 2;
+				kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+				}
+			else if(nna==2)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pA += 2*bs;
+				pC += 2 + (sdc-1)*bs;
+//				pC[0+bs*0] = alpha * pA[0+bs*0];
+//				pC[0+bs*1] = alpha * pA[1+bs*0];
+//				pC[0+bs*2] = alpha * pA[2+bs*0];
+				kernel_dgetr_3_lib4(0, n-2, 0, alpha, pA, pC, sdc);
+				pA += 1*bs;
+				pC += 1;
+				tna = 3;
+//				kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+				}
+			else //if(nna==3)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[2+bs*0] = alpha * pA[0+bs*2];
+				pC[2+bs*1] = alpha * pA[1+bs*2];
+				pC[2+bs*2] = alpha * pA[2+bs*2];
+				pA += 3*bs;
+				pC += 3 + (sdc-1)*bs;
+				tna = 0;
+				kernel_dgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+				}
+			}
+		ii += mna;
+		pA += mna + bs*(sda-1);
+		pC += mna*bs;
+		}
+#if 0 //defined(TARGET_X64_AVX2)
+	for( ; ii<m-7; ii+=8)
+		{
+		kernel_dgetr_8_lib4(0, n, nna, alpha, pA, sda, pC, sdc);
+		pA += 2*bs*sda;
+		pC += 2*bs*bs;
+		}
+#endif
+	for( ; ii<m-3; ii+=4)
+		{
+		if(tna==0)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[2+bs*0] = alpha * pA[0+bs*2];
+			pC[2+bs*1] = alpha * pA[1+bs*2];
+			pC[2+bs*2] = alpha * pA[2+bs*2];
+			pC[3+bs*0] = alpha * pA[0+bs*3];
+			pC[3+bs*1] = alpha * pA[1+bs*3];
+			pC[3+bs*2] = alpha * pA[2+bs*3];
+			pC[3+bs*3] = alpha * pA[3+bs*3];
+			pA += 4*bs;
+			pC += sdc*bs;
+			kernel_dgetr_4_lib4(0, n-ii-4, 0, alpha, pA, pC, sdc);
+			}
+		else if(tna==1)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pA += bs;
+			pC += 1 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[1+bs*2] = alpha * pA[2+bs*1];
+			pC[2+bs*0] = alpha * pA[0+bs*2];
+			pC[2+bs*1] = alpha * pA[1+bs*2];
+			pC[2+bs*2] = alpha * pA[2+bs*2];
+			pC[2+bs*3] = alpha * pA[3+bs*2];
+			pA += 3*bs;
+			pC += 3;
+			kernel_dgetr_4_lib4(0, n-ii-4, 1, alpha, pA, pC, sdc);
+			}
+		else if(tna==2)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pA += 2*bs;
+			pC += 2 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[0+bs*2] = alpha * pA[2+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[1+bs*2] = alpha * pA[2+bs*1];
+			pC[1+bs*3] = alpha * pA[3+bs*1];
+			pA += 2*bs;
+			pC += 2;
+			kernel_dgetr_4_lib4(0, n-ii-4, 2, alpha, pA, pC, sdc);
+			}
+		else //if(tna==3)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[2+bs*0] = alpha * pA[0+bs*2];
+			pC[2+bs*1] = alpha * pA[1+bs*2];
+			pC[2+bs*2] = alpha * pA[2+bs*2];
+			pA += 3*bs;
+			pC += 3 + (sdc-1)*bs;
+			kernel_dgetr_4_lib4(0, n-ii-3, 0, alpha, pA, pC, sdc);
+//			pC[0+bs*0] = alpha * pA[0+bs*0];
+//			pC[0+bs*1] = alpha * pA[1+bs*0];
+//			pC[0+bs*2] = alpha * pA[2+bs*0];
+//			pC[0+bs*3] = alpha * pA[3+bs*0];
+			pA += bs;
+			pC += 1;
+//			kernel_dgetr_4_lib4(0, n-ii-4, tna, alpha, pA, pC, sdc);
+			}
+		pA += bs*sda;
+		pC += bs*bs;
+		}
+
+	// clean-up at the end
+	if(ii==m)
+		return;
+
+	if(m-ii==1)
+		{
+		pC[0+bs*0] = alpha * pA[0+bs*0];
+		}
+	else if(m-ii==2)
+		{
+		if(tna!=1)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			}
+		else //if(tna==1)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pA += bs;
+			pC += 1 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			}
+		}
+	else if(m-ii==3)
+		{
+		if(tna==0 || tna==3)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[2+bs*0] = alpha * pA[0+bs*2];
+			pC[2+bs*1] = alpha * pA[1+bs*2];
+			pC[2+bs*2] = alpha * pA[2+bs*2];
+			}
+		else if(tna==1)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pA += bs;
+			pC += 1 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[1+bs*2] = alpha * pA[2+bs*1];
+			}
+		else //if(tna==2)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pA += 2*bs;
+			pC += 2 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[0+bs*2] = alpha * pA[2+bs*0];
+			}
+		}
+
+	return;
+
+	}
+
+
+
+// regularize diagonal
+void ddiareg_lib(int kmax, double reg, int offset, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] += reg;
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+(jj+0)*bs+0] += reg;
+		pD[jj*sdd+(jj+1)*bs+1] += reg;
+		pD[jj*sdd+(jj+2)*bs+2] += reg;
+		pD[jj*sdd+(jj+3)*bs+3] += reg;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] += reg;
+		}
+
+	}
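+
+// Minimal usage sketch (illustrative only; pD points to an aligned panel-major
+// matrix with panel stride sdd, both hypothetical): add a small regularization
+// term to the first kmax diagonal elements, e.g. before a factorization:
+//
+//	ddiareg_lib(kmax, 1e-8, 0, pD, sdd);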
+
+
+
+// insert sqrt of vector to diagonal
+void ddiain_sqrt_lib(int kmax, double *x, int offset, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] = sqrt(x[ll]);
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+(jj+0)*bs+0] = sqrt(x[jj+0]);
+		pD[jj*sdd+(jj+1)*bs+1] = sqrt(x[jj+1]);
+		pD[jj*sdd+(jj+2)*bs+2] = sqrt(x[jj+2]);
+		pD[jj*sdd+(jj+3)*bs+3] = sqrt(x[jj+3]);
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] = sqrt(x[jj+ll]);
+		}
+
+	}
+
+
+
+// extract diagonal to vector
+void ddiaex_lib(int kmax, double alpha, int offset, double *pD, int sdd, double *x)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			x[ll] = alpha * pD[ll+bs*ll];
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		x[jj+0] = alpha * pD[jj*sdd+(jj+0)*bs+0];
+		x[jj+1] = alpha * pD[jj*sdd+(jj+1)*bs+1];
+		x[jj+2] = alpha * pD[jj*sdd+(jj+2)*bs+2];
+		x[jj+3] = alpha * pD[jj*sdd+(jj+3)*bs+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		x[jj+ll] = alpha * pD[jj*sdd+(jj+ll)*bs+ll];
+		}
+
+	}
+
+
+
+// add scaled vector to diagonal
+void ddiaad_lib(int kmax, double alpha, double *x, int offset, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] += alpha * x[ll];
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+(jj+0)*bs+0] += alpha * x[jj+0];
+		pD[jj*sdd+(jj+1)*bs+1] += alpha * x[jj+1];
+		pD[jj*sdd+(jj+2)*bs+2] += alpha * x[jj+2];
+		pD[jj*sdd+(jj+3)*bs+3] += alpha * x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] += alpha * x[jj+ll];
+		}
+
+	}
+
+
+
+// insert vector to diagonal, sparse formulation
+void ddiain_libsp(int kmax, int *idx, double alpha, double *x, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] = alpha * x[jj];
+		}
+
+	}
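+
+// Note on the addressing used by the ddia*_libsp routines: in the panel-major
+// format with bs=4, the diagonal element (ii,ii) lives at
+// pD[ii/bs*bs*sdd + ii%bs + ii*bs], i.e. ii/bs selects the 4-row panel,
+// ii%bs the row inside the panel, and ii*bs the column offset within it.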
+
+
+
+// extract diagonal to vector, sparse formulation
+void ddiaex_libsp(int kmax, int *idx, double alpha, double *pD, int sdd, double *x)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		x[jj] = alpha * pD[ii/bs*bs*sdd+ii%bs+ii*bs];
+		}
+
+	}
+
+
+
+// add scaled vector to diagonal, sparse formulation
+void ddiaad_libsp(int kmax, int *idx, double alpha, double *x, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] += alpha * x[jj];
+		}
+
+	}
+
+
+
+// add scaled vector to another vector and insert to diagonal, sparse formulation
+void ddiaadin_libsp(int kmax, int *idx, double alpha, double *x, double *y, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] = y[jj] + alpha * x[jj];
+		}
+
+	}
+
+
+
+// insert vector to row
+void drowin_lib(int kmax, double alpha, double *x, double *pD)
+	{
+
+	const int bs = 4;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[(jj+0)*bs] = alpha*x[jj+0];
+		pD[(jj+1)*bs] = alpha*x[jj+1];
+		pD[(jj+2)*bs] = alpha*x[jj+2];
+		pD[(jj+3)*bs] = alpha*x[jj+3];
+		}
+	for(; jj<kmax; jj++)
+		{
+		pD[(jj)*bs] = alpha*x[jj];
+		}
+
+	}
+
+
+
+// extract row to vector
+void drowex_lib(int kmax, double alpha, double *pD, double *x)
+	{
+
+	const int bs = 4;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		x[jj+0] = alpha*pD[(jj+0)*bs];
+		x[jj+1] = alpha*pD[(jj+1)*bs];
+		x[jj+2] = alpha*pD[(jj+2)*bs];
+		x[jj+3] = alpha*pD[(jj+3)*bs];
+		}
+	for(; jj<kmax; jj++)
+		{
+		x[jj] = alpha*pD[(jj)*bs];
+		}
+
+	}
+
+
+
+// add scaled vector to row
+void drowad_lib(int kmax, double alpha, double *x, double *pD)
+	{
+
+	const int bs = 4;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[(jj+0)*bs] += alpha * x[jj+0];
+		pD[(jj+1)*bs] += alpha * x[jj+1];
+		pD[(jj+2)*bs] += alpha * x[jj+2];
+		pD[(jj+3)*bs] += alpha * x[jj+3];
+		}
+	for(; jj<kmax; jj++)
+		{
+		pD[(jj)*bs] += alpha * x[jj];
+		}
+
+	}
+
+
+
+// insert vector to row, sparse formulation
+void drowin_libsp(int kmax, double alpha, int *idx, double *x, double *pD)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] = alpha*x[jj];
+		}
+
+	}
+
+
+
+// add scaled vector to row, sparse formulation
+void drowad_libsp(int kmax, int *idx, double alpha, double *x, double *pD)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] += alpha * x[jj];
+		}
+
+	}
+
+
+
+// add scaled vector to another vector and insert to row, sparse formulation
+void drowadin_libsp(int kmax, int *idx, double alpha, double *x, double *y, double *pD)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] = y[jj] + alpha * x[jj];
+		}
+
+	}
+
+
+
+// swap two rows
+void drowsw_lib(int kmax, double *pA, double *pC)
+	{
+
+	const int bs = 4;
+
+	int ii;
+	double tmp;
+
+	for(ii=0; ii<kmax-3; ii+=4)
+		{
+		tmp = pA[0+bs*0];
+		pA[0+bs*0] = pC[0+bs*0];
+		pC[0+bs*0] = tmp;
+		tmp = pA[0+bs*1];
+		pA[0+bs*1] = pC[0+bs*1];
+		pC[0+bs*1] = tmp;
+		tmp = pA[0+bs*2];
+		pA[0+bs*2] = pC[0+bs*2];
+		pC[0+bs*2] = tmp;
+		tmp = pA[0+bs*3];
+		pA[0+bs*3] = pC[0+bs*3];
+		pC[0+bs*3] = tmp;
+		pA += 4*bs;
+		pC += 4*bs;
+		}
+	for( ; ii<kmax; ii++)
+		{
+		tmp = pA[0+bs*0];
+		pA[0+bs*0] = pC[0+bs*0];
+		pC[0+bs*0] = tmp;
+		pA += 1*bs;
+		pC += 1*bs;
+		}
+
+	}
+
+
+
+// extract vector from column
+void dcolex_lib(int kmax, int offset, double *pD, int sdd, double *x)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			x[ll] = pD[ll];
+			}
+		pD += kna + bs*(sdd-1);
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		x[jj+0] = pD[jj*sdd+0];
+		x[jj+1] = pD[jj*sdd+1];
+		x[jj+2] = pD[jj*sdd+2];
+		x[jj+3] = pD[jj*sdd+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		x[jj+ll] = pD[jj*sdd+ll];
+		}
+
+	}
+
+
+
+// insert vector to column
+void dcolin_lib(int kmax, double *x, int offset, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll] = x[ll];
+			}
+		pD += kna + bs*(sdd-1);
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+0] = x[jj+0];
+		pD[jj*sdd+1] = x[jj+1];
+		pD[jj*sdd+2] = x[jj+2];
+		pD[jj*sdd+3] = x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+ll] = x[jj+ll];
+		}
+
+	}
+
+
+
+// add scaled vector to column
+void dcolad_lib(int kmax, double alpha, double *x, int offset, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll] += alpha * x[ll];
+			}
+		pD += kna + bs*(sdd-1);
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+0] += alpha * x[jj+0];
+		pD[jj*sdd+1] += alpha * x[jj+1];
+		pD[jj*sdd+2] += alpha * x[jj+2];
+		pD[jj*sdd+3] += alpha * x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+ll] += alpha * x[jj+ll];
+		}
+
+	}
+
+
+
+// insert vector to column, sparse formulation
+void dcolin_libsp(int kmax, int *idx, double *x, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs] = x[jj];
+		}
+
+	}
+
+
+
+// add scaled vector to column, sparse formulation
+void dcolad_libsp(int kmax, double alpha, int *idx, double *x, double *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs] += alpha * x[jj];
+		}
+
+	}
+
+
+
+// swap two columns
+void dcolsw_lib(int kmax, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc)
+	{
+
+	const int bs = 4;
+
+	int ii;
+
+	double tmp;
+
+	if(offsetA==offsetC)
+		{
+		if(offsetA>0)
+			{
+			ii = 0;
+			for(; ii<bs-offsetA; ii++)
+				{
+				tmp = pA[0+bs*0];
+				pA[0+bs*0] = pC[0+bs*0];
+				pC[0+bs*0] = tmp;
+				pA += 1;
+				pC += 1;
+				}
+			pA += bs*(sda-1);
+			pC += bs*(sdc-1);
+			kmax -= bs-offsetA;
+			}
+		ii = 0;
+		for(; ii<kmax-3; ii+=4)
+			{
+			tmp = pA[0+bs*0];
+			pA[0+bs*0] = pC[0+bs*0];
+			pC[0+bs*0] = tmp;
+			tmp = pA[1+bs*0];
+			pA[1+bs*0] = pC[1+bs*0];
+			pC[1+bs*0] = tmp;
+			tmp = pA[2+bs*0];
+			pA[2+bs*0] = pC[2+bs*0];
+			pC[2+bs*0] = tmp;
+			tmp = pA[3+bs*0];
+			pA[3+bs*0] = pC[3+bs*0];
+			pC[3+bs*0] = tmp;
+			pA += bs*sda;
+			pC += bs*sdc;
+			}
+		for(; ii<kmax; ii++)
+			{
+			tmp = pA[0+bs*0];
+			pA[0+bs*0] = pC[0+bs*0];
+			pC[0+bs*0] = tmp;
+			pA += 1;
+			pC += 1;
+			}
+		}
+	else
+		{
+		printf("\ndcolsw: feature not implemented yet: offsetA!=offsetC\n\n");
+		exit(1);
+		}
+
+	return;
+
+	}
+
+
+
+// insert vector to vector, sparse formulation
+void dvecin_libsp(int kmax, int *idx, double *x, double *y)
+	{
+
+	int jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		y[idx[jj]] = x[jj];
+		}
+
+	}
+
+
+
+// add scaled vector to vector, sparse formulation
+void dvecad_libsp(int kmax, int *idx, double alpha, double *x, double *y)
+	{
+
+	int jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		y[idx[jj]] += alpha * x[jj];
+		}
+
+	}
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// return the memory size (in bytes) needed for a strmat
+int d_size_strmat(int m, int n)
+	{
+	const int bs = 4;
+	int nc = D_NC;
+	int al = bs*nc;
+	int pm = (m+bs-1)/bs*bs;
+	int cn = (n+nc-1)/nc*nc;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	int memory_size = (pm*cn+tmp)*sizeof(double);
+	return memory_size;
+	}
+
+
+
+// return the memory size (in bytes) needed for the diagonal of a strmat
+int d_size_diag_strmat(int m, int n)
+	{
+	const int bs = 4;
+	int nc = D_NC;
+	int al = bs*nc;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	int memory_size = tmp*sizeof(double);
+	return memory_size;
+	}
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void d_create_strmat(int m, int n, struct d_strmat *sA, void *memory)
+	{
+	const int bs = 4;
+	int nc = D_NC;
+	int al = bs*nc;
+	sA->m = m;
+	sA->n = n;
+	int pm = (m+bs-1)/bs*bs;
+	int cn = (n+nc-1)/nc*nc;
+	sA->pm = pm;
+	sA->cn = cn;
+	double *ptr = (double *) memory;
+	sA->pA = ptr;
+	ptr += pm*cn;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	sA->dA = ptr;
+	ptr += tmp;
+	sA->use_dA = 0;
+	sA->memory_size = (pm*cn+tmp)*sizeof(double);
+	return;
+	}
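+
+// Minimal allocation sketch (illustrative only; posix_memalign is used here as
+// an example allocator, with 64-byte alignment as a safe choice for the
+// vectorized targets): create an m x n d_strmat backed by user memory:
+//
+//	struct d_strmat sA;
+//	void *mem;
+//	posix_memalign(&mem, 64, d_size_strmat(m, n));
+//	d_create_strmat(m, n, &sA, mem);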
+
+
+
+// return memory size (in bytes) needed for a strvec
+int d_size_strvec(int m)
+	{
+	const int bs = 4;
+//	int nc = D_NC;
+//	int al = bs*nc;
+	int pm = (m+bs-1)/bs*bs;
+	int memory_size = pm*sizeof(double);
+	return memory_size;
+	}
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void d_create_strvec(int m, struct d_strvec *sa, void *memory)
+	{
+	const int bs = 4;
+//	int nc = D_NC;
+//	int al = bs*nc;
+	sa->m = m;
+	int pm = (m+bs-1)/bs*bs;
+	sa->pm = pm;
+	double *ptr = (double *) memory;
+	sa->pa = ptr;
+//	ptr += pm;
+	sa->memory_size = pm*sizeof(double);
+	return;
+	}
+
+
+
+// convert a matrix into a matrix structure
+void d_cvt_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, j, jj, m0, m1, m2;
+	double 	*B, *pB;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	__m256d
+		tmp;
+#endif
+	m0 = (bs-ai%bs)%bs;
+	if(m0>m)
+		m0 = m;
+	m1 = m - m0;
+	jj = 0;
+	for( ; jj<n-3; jj+=4)
+		{
+		B  =  A + jj*lda;
+		pB = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for( ; ii<m0; ii++)
+				{
+				pB[ii+bs*0] = B[ii+lda*0];
+				pB[ii+bs*1] = B[ii+lda*1];
+				pB[ii+bs*2] = B[ii+lda*2];
+				pB[ii+bs*3] = B[ii+lda*3];
+				}
+			B  += m0;
+			pB += m0 + bs*(sda-1);
+			}
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+		for( ; ii<m-3; ii+=4)
+			{
+			tmp = _mm256_loadu_pd( &B[0+lda*0] );
+			_mm256_store_pd( &pB[0+bs*0], tmp );
+			tmp = _mm256_loadu_pd( &B[0+lda*1] );
+			_mm256_store_pd( &pB[0+bs*1], tmp );
+			tmp = _mm256_loadu_pd( &B[0+lda*2] );
+			_mm256_store_pd( &pB[0+bs*2], tmp );
+			tmp = _mm256_loadu_pd( &B[0+lda*3] );
+			_mm256_store_pd( &pB[0+bs*3], tmp );
+			// update
+			B  += 4;
+			pB += bs*sda;
+			}
+#else
+		for( ; ii<m-3; ii+=4)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			pB[1+bs*0] = B[1+lda*0];
+			pB[2+bs*0] = B[2+lda*0];
+			pB[3+bs*0] = B[3+lda*0];
+			// col 1
+			pB[0+bs*1] = B[0+lda*1];
+			pB[1+bs*1] = B[1+lda*1];
+			pB[2+bs*1] = B[2+lda*1];
+			pB[3+bs*1] = B[3+lda*1];
+			// col 2
+			pB[0+bs*2] = B[0+lda*2];
+			pB[1+bs*2] = B[1+lda*2];
+			pB[2+bs*2] = B[2+lda*2];
+			pB[3+bs*2] = B[3+lda*2];
+			// col 3
+			pB[0+bs*3] = B[0+lda*3];
+			pB[1+bs*3] = B[1+lda*3];
+			pB[2+bs*3] = B[2+lda*3];
+			pB[3+bs*3] = B[3+lda*3];
+			// update
+			B  += 4;
+			pB += bs*sda;
+			}
+#endif
+		for( ; ii<m; ii++)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			// col 1
+			pB[0+bs*1] = B[0+lda*1];
+			// col 2
+			pB[0+bs*2] = B[0+lda*2];
+			// col 3
+			pB[0+bs*3] = B[0+lda*3];
+			// update
+			B  += 1;
+			pB += 1;
+			}
+		}
+	for( ; jj<n; jj++)
+		{
+
+		B  =  A + jj*lda;
+		pB = pA + jj*bs;
+
+		ii = 0;
+		if(m0>0)
+			{
+			for( ; ii<m0; ii++)
+				{
+				pB[ii+bs*0] = B[ii+lda*0];
+				}
+			B  += m0;
+			pB += m0 + bs*(sda-1);
+			}
+		for( ; ii<m-3; ii+=4)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			pB[1+bs*0] = B[1+lda*0];
+			pB[2+bs*0] = B[2+lda*0];
+			pB[3+bs*0] = B[3+lda*0];
+			// update
+			B  += 4;
+			pB += bs*sda;
+			}
+		for( ; ii<m; ii++)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			// update
+			B  += 1;
+			pB += 1;
+			}
+		}
+	return;
+	}
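+
+// Minimal usage sketch (illustrative only; A is a hypothetical column-major
+// m x n array with leading dimension lda, sA an already-created d_strmat of
+// sufficient size): pack A into the top-left corner of sA and unpack it back:
+//
+//	d_cvt_mat2strmat(m, n, A, lda, &sA, 0, 0);
+//	d_cvt_strmat2mat(m, n, &sA, 0, 0, A, lda);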
+
+
+
+// convert and transpose a matrix into a matrix structure
+void d_cvt_tran_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, j, m0, m1, m2;
+	double 	*B, *pB;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	__m256d
+		v0, v1, v2, v3,
+		v4, v5, v6, v7;
+#endif
+	m0 = (bs-ai%bs)%bs;
+	if(m0>n)
+		m0 = n;
+	m1 = n - m0;
+	ii = 0;
+	if(m0>0)
+		{
+		for(j=0; j<m; j++)
+			{
+			for(i=0; i<m0; i++)
+				{
+				pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+				}
+			}
+		A  += m0*lda;
+		pA += m0 + bs*(sda-1);
+		}
+	ii = 0;
+	for(; ii<m1-3; ii+=bs)
+		{
+		j=0;
+		B  = A + ii*lda;
+		pB = pA + ii*sda;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+		for(; j<m-3; j+=4)
+			{
+			v0 = _mm256_loadu_pd( &B[0+0*lda] ); // 00 10 20 30
+			v1 = _mm256_loadu_pd( &B[0+1*lda] ); // 01 11 21 31
+			v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
+			v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
+			v2 = _mm256_loadu_pd( &B[0+2*lda] ); // 02 12 22 32
+			v3 = _mm256_loadu_pd( &B[0+3*lda] ); // 03 13 23 33
+			v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
+			v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33
+
+			B += 4;
+
+			v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
+			_mm256_store_pd( &pB[0+bs*0], v0 );
+			v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
+			_mm256_store_pd( &pB[0+bs*2], v2 );
+			v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
+			_mm256_store_pd( &pB[0+bs*1], v1 );
+			v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
+			_mm256_store_pd( &pB[0+bs*3], v3 );
+
+			pB += 4*bs;
+			}
+#else
+		for(; j<m-3; j+=4)
+			{
+			// unroll 0
+			pB[0+0*bs] = B[0+0*lda];
+			pB[1+0*bs] = B[0+1*lda];
+			pB[2+0*bs] = B[0+2*lda];
+			pB[3+0*bs] = B[0+3*lda];
+			// unroll 1
+			pB[0+1*bs] = B[1+0*lda];
+			pB[1+1*bs] = B[1+1*lda];
+			pB[2+1*bs] = B[1+2*lda];
+			pB[3+1*bs] = B[1+3*lda];
+			// unroll 2
+			pB[0+2*bs] = B[2+0*lda];
+			pB[1+2*bs] = B[2+1*lda];
+			pB[2+2*bs] = B[2+2*lda];
+			pB[3+2*bs] = B[2+3*lda];
+			// unroll 3
+			pB[0+3*bs] = B[3+0*lda];
+			pB[1+3*bs] = B[3+1*lda];
+			pB[2+3*bs] = B[3+2*lda];
+			pB[3+3*bs] = B[3+3*lda];
+			B  += 4;
+			pB += 4*bs;
+			}
+#endif
+		for(; j<m; j++)
+			{
+			// unroll 0
+			pB[0+0*bs] = B[0+0*lda];
+			pB[1+0*bs] = B[0+1*lda];
+			pB[2+0*bs] = B[0+2*lda];
+			pB[3+0*bs] = B[0+3*lda];
+			B  += 1;
+			pB += 1*bs;
+			}
+		}
+	if(ii<m1)
+		{
+		m2 = m1-ii;
+		if(bs<m2) m2 = bs;
+		for(j=0; j<m; j++)
+			{
+			for(i=0; i<m2; i++)
+				{
+				pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+				}
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector into a vector structure
+void d_cvt_vec2strvec(int m, double *a, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		pa[ii] = a[ii];
+	return;
+	}
+
+
+
+// convert a matrix structure into a matrix
+void d_cvt_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, jj;
+	int m0 = (bs-ai%bs)%bs;
+	double *ptr_pA;
+	jj=0;
+	for(; jj<n-3; jj+=4)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				// unroll 0
+				A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+				// unroll 1
+				A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+				// unroll 2
+				A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+				// unroll 3
+				A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			// unroll 0
+			A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+			A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
+			A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
+			A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
+			// unroll 1
+			A[0+ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+			A[1+ii+lda*(jj+1)] = ptr_pA[1+bs*1];
+			A[2+ii+lda*(jj+1)] = ptr_pA[2+bs*1];
+			A[3+ii+lda*(jj+1)] = ptr_pA[3+bs*1];
+			// unroll 2
+			A[0+ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+			A[1+ii+lda*(jj+2)] = ptr_pA[1+bs*2];
+			A[2+ii+lda*(jj+2)] = ptr_pA[2+bs*2];
+			A[3+ii+lda*(jj+2)] = ptr_pA[3+bs*2];
+			// unroll 3
+			A[0+ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+			A[1+ii+lda*(jj+3)] = ptr_pA[1+bs*3];
+			A[2+ii+lda*(jj+3)] = ptr_pA[2+bs*3];
+			A[3+ii+lda*(jj+3)] = ptr_pA[3+bs*3];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			// unroll 0
+			A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+			// unroll 1
+			A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+			// unroll 2
+			A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+			// unroll 3
+			A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+			ptr_pA++;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				A[ii+lda*jj] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			A[0+ii+lda*jj] = ptr_pA[0];
+			A[1+ii+lda*jj] = ptr_pA[1];
+			A[2+ii+lda*jj] = ptr_pA[2];
+			A[3+ii+lda*jj] = ptr_pA[3];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			A[ii+lda*jj] = ptr_pA[0];
+			ptr_pA++;
+			}
+		}
+	return;
+	}
+
+
+
+// convert and transpose a matrix structure into a matrix
+void d_cvt_tran_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, jj;
+	int m0 = (bs-ai%bs)%bs;
+	double *ptr_pA;
+	jj=0;
+	for(; jj<n-3; jj+=4)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				// unroll 0
+				A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+				// unroll 1
+				A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+				// unroll 2
+				A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+				// unroll 3
+				A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			// unroll 0
+			A[jj+0+lda*(ii+0)] = ptr_pA[0+bs*0];
+			A[jj+0+lda*(ii+1)] = ptr_pA[1+bs*0];
+			A[jj+0+lda*(ii+2)] = ptr_pA[2+bs*0];
+			A[jj+0+lda*(ii+3)] = ptr_pA[3+bs*0];
+			// unroll 1
+			A[jj+1+lda*(ii+0)] = ptr_pA[0+bs*1];
+			A[jj+1+lda*(ii+1)] = ptr_pA[1+bs*1];
+			A[jj+1+lda*(ii+2)] = ptr_pA[2+bs*1];
+			A[jj+1+lda*(ii+3)] = ptr_pA[3+bs*1];
+			// unroll 2
+			A[jj+2+lda*(ii+0)] = ptr_pA[0+bs*2];
+			A[jj+2+lda*(ii+1)] = ptr_pA[1+bs*2];
+			A[jj+2+lda*(ii+2)] = ptr_pA[2+bs*2];
+			A[jj+2+lda*(ii+3)] = ptr_pA[3+bs*2];
+			// unroll 3
+			A[jj+3+lda*(ii+0)] = ptr_pA[0+bs*3];
+			A[jj+3+lda*(ii+1)] = ptr_pA[1+bs*3];
+			A[jj+3+lda*(ii+2)] = ptr_pA[2+bs*3];
+			A[jj+3+lda*(ii+3)] = ptr_pA[3+bs*3];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			// unroll 0
+			A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+			// unroll 1
+			A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+			// unroll 2
+			A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+			// unroll 3
+			A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+			ptr_pA++;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				A[jj+lda*ii] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			i=0;
+			for(; i<bs; i++)
+				{
+				A[jj+lda*(i+ii)] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			A[jj+lda*ii] = ptr_pA[0];
+			ptr_pA++;
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector structure into a vector
+void d_cvt_strvec2vec(int m, struct d_strvec *sa, int ai, double *a)
+	{
+	double *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		a[ii] = pa[ii];
+	return;
+	}
+
+
+
+// cast a matrix into a matrix structure
+void d_cast_mat2strmat(double *A, struct d_strmat *sA)
+	{
+	sA->pA = A;
+	return;
+	}
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void d_cast_diag_mat2strmat(double *dA, struct d_strmat *sA)
+	{
+	sA->dA = dA;
+	return;
+	}
+
+
+
+// cast a vector into a vector structure
+void d_cast_vec2vecmat(double *a, struct d_strvec *sa)
+	{
+	sa->pa = a;
+	return;
+	}
+
+
+
+// insert element into strmat
+void dgein1_libstr(double a, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	pA[0] = a;
+	return;
+	}
+
+
+
+// extract element from strmat
+double dgeex1_libstr(struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	return pA[0];
+	}
+
+
+
+// insert element into strvec
+void dvecin1_libstr(double a, struct d_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	double *x = sx->pa + xi;
+	x[0] = a;
+	return;
+	}
+
+
+
+// extract element from strvec
+double dvecex1_libstr(struct d_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	double *x = sx->pa + xi;
+	return x[0];
+	}
+
+
+
+// set all elements of a strmat to a value
+void dgese_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai%bs + ai/bs*bs*sda + aj*bs;
+	int m0 = m<(bs-ai%bs)%bs ? m : (bs-ai%bs)%bs;
+	int ii, jj;
+	if(m0>0)
+		{
+		for(ii=0; ii<m0; ii++)
+			{
+			for(jj=0; jj<n; jj++)
+				{
+				pA[jj*bs] = alpha;
+				}
+			pA += 1;
+			}
+		pA += bs*(sda-1);
+		m -= m0;
+		}
+	for(ii=0; ii<m-3; ii+=4)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			pA[0+jj*bs] = alpha;
+			pA[1+jj*bs] = alpha;
+			pA[2+jj*bs] = alpha;
+			pA[3+jj*bs] = alpha;
+			}
+		pA += bs*sda;
+		}
+	for( ; ii<m; ii++)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			pA[jj*bs] = alpha;
+			}
+		pA += 1;
+		}
+	return;
+	}
+
+
+
+// set all elements of a strvec to a value
+void dvecse_libstr(int m, double alpha, struct d_strvec *sx, int xi)
+	{
+	double *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		x[ii] = alpha;
+	return;
+	}
+
+
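+// Illustrative usage sketch for the two setters above, assuming the
+// allocation helpers d_allocate_strmat / d_allocate_strvec / d_free_strmat /
+// d_free_strvec from d_aux_ext_dep_lib.c:
+//
+//     struct d_strmat sA;
+//     struct d_strvec sx;
+//     d_allocate_strmat(6, 4, &sA);
+//     d_allocate_strvec(6, &sx);
+//     dgese_libstr(6, 4, 0.0, &sA, 0, 0);   // zero the whole 6x4 matrix
+//     dvecse_libstr(6, 1.0, &sx, 0);        // fill the vector with ones
+//     d_free_strmat(&sA);
+//     d_free_strvec(&sx);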
+
+// insert a vector into diagonal
+void ddiain_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	double *x = sx->pa + xi;
+	int offsetA = ai%bs;
+
+	int kna = (bs-offsetA%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pA[ll+bs*ll] = alpha*x[ll];
+			}
+		pA += kna + bs*(sda-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pA[jj*sda+(jj+0)*bs+0] = alpha*x[jj+0];
+		pA[jj*sda+(jj+1)*bs+1] = alpha*x[jj+1];
+		pA[jj*sda+(jj+2)*bs+2] = alpha*x[jj+2];
+		pA[jj*sda+(jj+3)*bs+3] = alpha*x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pA[jj*sda+(jj+ll)*bs+ll] = alpha*x[jj+ll];
+		}
+	return;
+	}
+
+
+
+// add scalar to diagonal
+void ddiare_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int offsetA = ai%bs;
+
+	int kna = (bs-offsetA%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pA[ll+bs*ll] += alpha;
+			}
+		pA += kna + bs*(sda-1) + kna*bs;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pA[jj*sda+(jj+0)*bs+0] += alpha;
+		pA[jj*sda+(jj+1)*bs+1] += alpha;
+		pA[jj*sda+(jj+2)*bs+2] += alpha;
+		pA[jj*sda+(jj+3)*bs+3] += alpha;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pA[jj*sda+(jj+ll)*bs+ll] += alpha;
+		}
+	return;
+	}
+
+
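+// Note: ddiain/ddiare above (and ddiaex/ddiaad further below) first handle
+// the kna = (bs-ai%bs)%bs leading entries needed to complete the current
+// 4-row panel, then walk the diagonal panel by panel; within a panel the
+// k-th diagonal entry of the current 4x4 block is pA[jj*sda+(jj+k)*bs+k],
+// i.e. each step moves one column (bs doubles) right and one row down.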
+
+// swap two rows of a matrix struct
+void drowsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	drowsw_lib(kmax, pA, pC);
+	return;
+	}
+
+
+
+// permute the rows of a matrix struct
+void drowpe_libstr(int kmax, int *ipiv, struct d_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			drowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+		}
+	return;
+	}
+
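+// Illustrative usage sketch: drowpe_libstr applies ipiv as a sequence of row
+// swaps (in the spirit of LAPACK's laswp), and dcolpe_libstr below does the
+// same for columns.  Assuming sA is a d_strmat with m = n = 4 (e.g. created
+// with d_allocate_strmat from d_aux_ext_dep_lib.c):
+//
+//     int ipiv[4] = {2, 3, 2, 3};
+//     drowpe_libstr(4, ipiv, &sA);   // swaps rows 0<->2 and 1<->3;
+//                                    // entries equal to their index are no-ops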
+
+// extract a row into a vector
+void drowex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	double *x = sx->pa + xi;
+	drowex_lib(kmax, alpha, pA, x);
+	return;
+	}
+
+
+
+// insert a vector into a row
+void drowin_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	double *x = sx->pa + xi;
+	drowin_lib(kmax, alpha, x, pA);
+	return;
+	}
+
+
+
+// add a vector to a row
+void drowad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	double *x = sx->pa + xi;
+	drowad_lib(kmax, alpha, x, pA);
+	return;
+	}
+
+
+
+// extract vector from column
+void dcolex_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	double *x = sx->pa + xi;
+	dcolex_lib(kmax, ai%bs, pA, sda, x);
+	return;
+	}
+
+
+
+
+// insert a vector as a column
+void dcolin_libstr(int kmax, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	double *x = sx->pa + xi;
+	dcolin_lib(kmax, x, ai%bs, pA, sda);
+	return;
+	}
+
+
+
+
+// swap two cols of a matrix struct
+void dcolsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	dcolsw_lib(kmax, ai%bs, pA, sda, ci%bs, pC, sdc);
+	return;
+	}
+
+
+
+// permute the cols of a matrix struct
+void dcolpe_libstr(int kmax, int *ipiv, struct d_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			dcolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+		}
+	return;
+	}
+
+
+
+// copy a generic strmat into a generic strmat
+void dgecp_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	dgecp_lib(m, n, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc);
+	return;
+	}
+
+
+
+// scale a generic strmat
+void dgesc_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	dgecp_lib(m, n, alpha, ai%bs, pA, sda, ai%bs, pA, sda);
+	return;
+	}
+
+
+
+// copy a strvec into a strvec
+void dveccp_libstr(int m, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci)
+	{
+	double *pa = sa->pa + ai;
+	double *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] = pa[ii+0];
+		pc[ii+1] = pa[ii+1];
+		pc[ii+2] = pa[ii+2];
+		pc[ii+3] = pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] = pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// scale a strvec
+void dvecsc_libstr(int m, double alpha, struct d_strvec *sa, int ai)
+	{
+	double *pa = sa->pa + ai;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pa[ii+0] *= alpha;
+		pa[ii+1] *= alpha;
+		pa[ii+2] *= alpha;
+		pa[ii+3] *= alpha;
+		}
+	for(; ii<m; ii++)
+		{
+		pa[ii+0] *= alpha;
+		}
+	return;
+	}
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void dtrcp_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	dtrcp_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc);
+	return;
+	}
+
+
+
+// scale and add a generic strmat into a generic strmat
+void dgead_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	dgead_lib(m, n, alpha, ai%bs, pA, sda, ci%bs, pC, sdc);
+	return;
+	}
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void dgetr_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	dgetr_lib(m, n, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+	return;
+	}
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void dtrtr_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	dtrtr_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+	return;
+	}
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void dtrtr_u_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	double *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	dtrtr_u_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+	return;
+	}
+
+
+
+// insert a strvec to diagonal of strmat, sparse formulation
+void ddiain_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+	{
+	const int bs = 4;
+	double *x = sx->pa + xi;
+	int sdd = sD->cn;
+	double *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = alpha * x[jj];
+		}
+	return;
+	}
+
+
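+// Note: in the *_sp_libstr "sparse formulation" routines the integer array
+// idx selects which entries are touched: element jj of the vector goes to
+// diagonal position idx[jj] of the sub-matrix addressed by (di,dj), so e.g.
+// kmax = 2 with idx = {0, 3} updates only the first and fourth diagonal
+// elements.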
+
+// extract a vector from diagonal
+void ddiaex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	double *x = sx->pa + xi;
+	int offsetA = ai%bs;
+
+	int kna = (bs-offsetA%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			x[ll] = alpha*pA[ll+bs*ll];
+			}
+		pA += kna + bs*(sda-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		x[jj+0] = alpha*pA[jj*sda+(jj+0)*bs+0];
+		x[jj+1] = alpha*pA[jj*sda+(jj+1)*bs+1];
+		x[jj+2] = alpha*pA[jj*sda+(jj+2)*bs+2];
+		x[jj+3] = alpha*pA[jj*sda+(jj+3)*bs+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		x[jj+ll] = alpha*pA[jj*sda+(jj+ll)*bs+ll];
+		}
+	return;
+	}
+
+
+
+// extract the diagonal of a strmat to a strvec, sparse formulation
+void ddiaex_sp_libstr(int kmax, double alpha, int *idx, struct d_strmat *sD, int di, int dj, struct d_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	double *x = sx->pa + xi;
+	int sdd = sD->cn;
+	double *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		x[jj] = alpha * pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs];
+		}
+	return;
+	}
+
+
+
+// add a vector to diagonal
+void ddiaad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	double *x = sx->pa + xi;
+	int offsetA = ai%bs;
+
+	int kna = (bs-offsetA%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pA[ll+bs*ll] += alpha*x[ll];
+			}
+		pA += kna + bs*(sda-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pA[jj*sda+(jj+0)*bs+0] += alpha*x[jj+0];
+		pA[jj*sda+(jj+1)*bs+1] += alpha*x[jj+1];
+		pA[jj*sda+(jj+2)*bs+2] += alpha*x[jj+2];
+		pA[jj*sda+(jj+3)*bs+3] += alpha*x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pA[jj*sda+(jj+ll)*bs+ll] += alpha*x[jj+ll];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to diagonal of strmat, sparse formulation
+void ddiaad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+	{
+	const int bs = 4;
+	double *x = sx->pa + xi;
+	int sdd = sD->cn;
+	double *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
+void ddiaadin_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, int *idx, struct d_strmat *sD, int di, int dj)
+	{
+	const int bs = 4;
+	double *x = sx->pa + xi;
+	double *y = sy->pa + yi;
+	int sdd = sD->cn;
+	double *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = y[jj] + alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to row of strmat, sparse formulation
+void drowad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj)
+	{
+	const int bs = 4;
+	double *x = sx->pa + xi;
+	int sdd = sD->cn;
+	double *pD = sD->pA + di/bs*bs*sdd + di%bs + dj*bs;
+	drowad_libsp(kmax, idx, alpha, x, pD);
+	return;
+	}
+
+
+
+void dvecad_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi)
+	{
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[idx[ii]] += alpha * x[ii];
+	return;
+	}
+
+
+
+void dvecin_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi)
+	{
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[idx[ii]] = alpha * x[ii];
+	return;
+	}
+
+
+
+void dvecex_sp_libstr(int m, double alpha, int *idx, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+	{
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[ii] = alpha * x[idx[ii]];
+	return;
+	}
+
+
+
+void dveccl_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi)
+	{
+
+	double *xm = sxm->pa + xim;
+	double *x  = sx->pa + xi;
+	double *xp = sxp->pa + xip;
+	double *z  = sz->pa + zi;
+
+	int ii;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	double d0;
+
+	__m256d
+		xm0, x0, xp0, z0, tmp0, tmp1, ones, mones, mask1, mask2;
+
+	ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+	mones = _mm256_set_pd( -1.0, -1.0, -1.0, -1.0 );
+	mask1 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );
+
+	for(ii=0; ii<m-3; ii+=4)
+		{
+		x0  = _mm256_loadu_pd( &x[ii] );
+		xp0 = _mm256_loadu_pd( &xp[ii] );
+		xm0 = _mm256_loadu_pd( &xm[ii] );
+		tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
+		tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
+		z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
+		z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
+		_mm256_storeu_pd( &z[ii], z0 );
+		}
+	if(ii<m)
+		{
+		d0 = (double) m-ii;
+		mask2 = _mm256_broadcast_sd( &d0 );
+		mask2 = _mm256_sub_pd( mask1, mask2 );
+		x0  = _mm256_loadu_pd( &x[ii] );
+		xp0 = _mm256_loadu_pd( &xp[ii] );
+		xm0 = _mm256_loadu_pd( &xm[ii] );
+		tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
+		tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
+		z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
+		z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
+		_mm256_maskstore_pd( &z[ii], _mm256_castpd_si256( mask2 ), z0 );
+		}
+#else
+	for(ii=0; ii<m; ii++)
+		{
+		if(x[ii]>=xp[ii])
+			{
+			z[ii] = xp[ii];
+			}
+		else if(x[ii]<=xm[ii])
+			{
+			z[ii] = xm[ii];
+			}
+		else
+			{
+			z[ii] = x[ii];
+			}
+		}
+#endif
+
+	return;
+
+	}
+
+
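+// Illustrative usage sketch: dveccl_libstr clips x element-wise to the box
+// [xm, xp].  Plain arrays can be wrapped with d_cast_vec2vecmat (defined
+// above), which only sets the data pointer, and that is all this routine
+// reads:
+//
+//     double lo[4] = {-1.0, -1.0, -1.0, -1.0};
+//     double hi[4] = { 1.0,  1.0,  1.0,  1.0};
+//     double x[4]  = {-2.0,  0.5,  3.0,  0.0};
+//     double z[4];
+//     struct d_strvec sm, sx, sp, sz;
+//     d_cast_vec2vecmat(lo, &sm);
+//     d_cast_vec2vecmat(x,  &sx);
+//     d_cast_vec2vecmat(hi, &sp);
+//     d_cast_vec2vecmat(z,  &sz);
+//     dveccl_libstr(4, &sm, 0, &sx, 0, &sp, 0, &sz, 0);   // z = {-1.0, 0.5, 1.0, 0.0}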
+
+void dveccl_mask_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi, struct d_strvec *sm, int mi)
+	{
+
+	double *xm = sxm->pa + xim;
+	double *x  = sx->pa + xi;
+	double *xp = sxp->pa + xip;
+	double *z  = sz->pa + zi;
+	double *mask  = sm->pa + mi;
+
+	int ii;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	double d0;
+
+	__m256d
+		xm0, x0, xp0, z0, mask0, tmp0, tmp1, ones, mones, mask1, mask2;
+
+	ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+	mones = _mm256_set_pd( -1.0, -1.0, -1.0, -1.0 );
+	mask1 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );
+
+	for(ii=0; ii<m-3; ii+=4)
+		{
+		mask0 = _mm256_setzero_pd();
+		x0  = _mm256_loadu_pd( &x[ii] );
+		xp0 = _mm256_loadu_pd( &xp[ii] );
+		xm0 = _mm256_loadu_pd( &xm[ii] );
+		tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
+		tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
+		z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
+		z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
+		mask0 = _mm256_blendv_pd( mask0, ones, tmp0 );
+		mask0 = _mm256_blendv_pd( mask0, mones, tmp1 );
+		_mm256_storeu_pd( &z[ii], z0 );
+		_mm256_storeu_pd( &mask[ii], mask0 );
+		}
+	if(ii<m)
+		{
+		d0 = (double) m-ii;
+		mask2 = _mm256_broadcast_sd( &d0 );
+		mask2 = _mm256_sub_pd( mask1, mask2 );
+		mask0 = _mm256_setzero_pd();
+		x0  = _mm256_loadu_pd( &x[ii] );
+		xp0 = _mm256_loadu_pd( &xp[ii] );
+		xm0 = _mm256_loadu_pd( &xm[ii] );
+		tmp0 = _mm256_cmp_pd( xp0, x0, 0x2 );
+		tmp1 = _mm256_cmp_pd( x0, xm0, 0x2 );
+		z0 = _mm256_blendv_pd( x0, xp0, tmp0 );
+		z0 = _mm256_blendv_pd( z0, xm0, tmp1 );
+		mask0 = _mm256_blendv_pd( mask0, ones, tmp0 );
+		mask0 = _mm256_blendv_pd( mask0, mones, tmp1 );
+		_mm256_maskstore_pd( &z[ii], _mm256_castpd_si256( mask2 ), z0 );
+		_mm256_maskstore_pd( &mask[ii], _mm256_castpd_si256( mask2 ), mask0 );
+		}
+#else
+	for(ii=0; ii<m; ii++)
+		{
+		if(x[ii]>=xp[ii])
+			{
+			z[ii] = xp[ii];
+			mask[ii] = 1.0;
+			}
+		else if(x[ii]<=xm[ii])
+			{
+			z[ii] = xm[ii];
+			mask[ii] = -1.0;
+			}
+		else
+			{
+			z[ii] = x[ii];
+			mask[ii] = 0.0;
+			}
+		}
+#endif
+
+	return;
+
+	}
+
+
+
+void dvecze_libstr(int m, struct d_strvec *sm, int mi, struct d_strvec *sv, int vi, struct d_strvec *se, int ei)
+	{
+	double *mask = sm->pa + mi;
+	double *v = sv->pa + vi;
+	double *e = se->pa + ei;
+
+	int ii;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	double d0;
+
+	__m256d
+		mask0, mask1, mask2, mask3, fives, zeros, e0, v0;
+
+	fives = _mm256_set_pd( 0.5, 0.5, 0.5, 0.5 );
+	zeros = _mm256_setzero_pd();
+	mask3 = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 );
+
+	for(ii=0; ii<m-3; ii+=4)
+		{
+		v0 = _mm256_loadu_pd( &v[ii] );
+		mask0 = _mm256_loadu_pd( &mask[ii] );
+		mask1 = mask0;
+		mask0 = _mm256_sub_pd( mask0, fives);
+		mask1 = _mm256_add_pd( mask1, fives);
+		mask0 = _mm256_xor_pd( mask0, mask1);
+		e0 = _mm256_blendv_pd( zeros, v0, mask0 );
+		_mm256_storeu_pd( &e[ii], e0 );
+		}
+	if(ii<m)
+		{
+		d0 = (double) m-ii;
+		mask2 = _mm256_broadcast_sd( &d0 );
+		mask2 = _mm256_sub_pd( mask3, mask2 );
+		v0 = _mm256_loadu_pd( &v[ii] );
+		mask0 = _mm256_loadu_pd( &mask[ii] );
+		mask1 = mask0;
+		mask0 = _mm256_sub_pd( mask0, fives);
+		mask1 = _mm256_add_pd( mask1, fives);
+		mask0 = _mm256_xor_pd( mask0, mask1);
+		e0 = _mm256_blendv_pd( zeros, v0, mask0 );
+		_mm256_maskstore_pd( &e[ii], _mm256_castpd_si256( mask2 ), e0 );
+		}
+#else
+	for(ii=0; ii<m; ii++)
+		{
+		if(mask[ii]==0)
+			{
+			e[ii] = v[ii];
+			}
+		else
+			{
+			e[ii] = 0;
+			}
+		}
+#endif
+
+	}
+
+
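+// Note: dvecze_libstr zeroes v[] wherever the companion mask (as produced by
+// dveccl_mask_libstr: -1.0, 0.0 or +1.0 per entry) is nonzero.  In the AVX
+// branch the test "mask == 0" is done without a compare: (mask-0.5) and
+// (mask+0.5) have opposite signs only when mask is 0, so xor-ing them sets
+// the sign bit exactly in those lanes, and _mm256_blendv_pd (which looks
+// only at the sign bit) then selects v[ii] there and 0.0 everywhere else.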
+
+void dvecnrm_inf_libstr(int m, struct d_strvec *sx, int xi, double *ptr_norm)
+	{
+	int ii;
+	double *x = sx->pa + xi;
+	double norm = 0.0;
+	for(ii=0; ii<m; ii++)
+		norm = fmax(norm, fabs(x[ii]));
+	*ptr_norm = norm;
+	return;
+	}
+
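+// Illustrative usage sketch: infinity norm of a plain array wrapped as a
+// strvec with d_cast_vec2vecmat (defined above):
+//
+//     double v[3] = {1.0, -4.0, 2.5};
+//     double norm;
+//     struct d_strvec sv;
+//     d_cast_vec2vecmat(v, &sv);
+//     dvecnrm_inf_libstr(3, &sv, 0, &norm);   // norm == 4.0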
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/auxiliary/i_aux_ext_dep_lib.c b/auxiliary/i_aux_ext_dep_lib.c
new file mode 100644
index 0000000..1ca2292
--- /dev/null
+++ b/auxiliary/i_aux_ext_dep_lib.c
@@ -0,0 +1,111 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#if 0
+#include <malloc.h>
+#endif
+
+#if ! defined(OS_WINDOWS)
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+#endif
+
+
+
+/* creates a zero matrix */
+void int_zeros(int **pA, int row, int col)
+	{
+	void *temp = malloc((row*col)*sizeof(int));
+	*pA = temp;
+	int *A = *pA;
+	int i;
+	for(i=0; i<row*col; i++) A[i] = 0;
+	}
+
+
+
+/* creates a zero matrix aligned to a cache line */
+void int_zeros_align(int **pA, int row, int col)
+	{
+#if defined(OS_WINDOWS)
+	*pA = (int *) _aligned_malloc( (row*col)*sizeof(int), 64 );
+#else
+	void *temp;
+	int err = posix_memalign(&temp, 64, (row*col)*sizeof(int));
+	if(err!=0)
+		{
+		printf("Memory allocation error");
+		exit(1);
+		}
+	*pA = temp;
+#endif
+	int *A = *pA;
+	int i;
+	for(i=0; i<row*col; i++) A[i] = 0;
+	}
+
+
+
+/* frees matrix */
+void int_free(int *pA)
+	{
+	free( pA );
+	}
+
+
+
+/* frees aligned matrix */
+void int_free_align(int *pA)
+	{
+#if defined(OS_WINDOWS)
+	_aligned_free( pA );
+#else
+	free( pA );
+#endif
+	}
+
+
+
+/* prints a matrix in column-major format */
+void int_print_mat(int row, int col, int *A, int lda)
+	{
+	int i, j;
+	for(i=0; i<row; i++)
+		{
+		for(j=0; j<col; j++)
+			{
+			printf("%d ", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
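+// Illustrative usage sketch for the integer helpers in this file:
+//
+//     int *ipiv;
+//     int_zeros(&ipiv, 4, 1);          // 4x1 integer matrix, zero initialized
+//     int_print_mat(4, 1, ipiv, 4);    // prints a column of zeros
+//     int_free(ipiv);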
+
+
diff --git a/auxiliary/m_aux_lib.c b/auxiliary/m_aux_lib.c
new file mode 100644
index 0000000..30cb333
--- /dev/null
+++ b/auxiliary/m_aux_lib.c
@@ -0,0 +1,112 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_REFERENCE) | defined(LA_BLAS)
+
+
+
+
+void m_cvt_d2s_strvec(int m, struct d_strvec *vd, int vdi, struct s_strvec *vs, int vsi)
+	{
+	double *pd = vd->pa+vdi;
+	float *ps = vs->pa+vsi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		ps[ii] = (float) pd[ii];
+		}
+	return;
+	}
+
+
+
+void m_cvt_s2d_strvec(int m, struct s_strvec *vs, int vsi, struct d_strvec *vd, int vdi)
+	{
+	double *pd = vd->pa+vdi;
+	float *ps = vs->pa+vsi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		pd[ii] = (double) ps[ii];
+		}
+	return;
+	}
+
+
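+// Illustrative usage sketch: converting a double strvec to single precision.
+// The cast helpers (d_cast_vec2vecmat / s_cast_vec2vecmat, defined in the
+// d_ and s_ aux files of this patch) only set the data pointer, which is all
+// these conversion routines read:
+//
+//     double xd[3] = {1.0, 2.0, 3.0};
+//     float  xs[3];
+//     struct d_strvec svd;
+//     struct s_strvec svs;
+//     d_cast_vec2vecmat(xd, &svd);
+//     s_cast_vec2vecmat(xs, &svs);
+//     m_cvt_d2s_strvec(3, &svd, 0, &svs, 0);   // xs = {1.0f, 2.0f, 3.0f}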
+
+void m_cvt_d2s_strmat(int m, int n, struct d_strmat *Md, int mid, int nid, struct s_strmat *Ms, int mis, int nis)
+	{
+	int lda = Md->m;
+	int ldb = Ms->m;
+	double *pA = Md->pA+mid+nid*lda;
+	float *pB = Ms->pA+mis+nis*ldb;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			pB[ii+jj*ldb] = (float) pA[ii+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+void m_cvt_s2d_strmat(int m, int n, struct s_strmat *Ms, int mis, int nis, struct d_strmat *Md, int mid, int nid)
+	{
+	int lda = Ms->m;
+	int ldb = Md->m;
+	float *pA = Ms->pA+mis+nis*lda;
+	double *pB = Md->pA+mid+nid*ldb;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			pB[ii+jj*ldb] = (double) pA[ii+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/auxiliary/m_aux_lib44.c b/auxiliary/m_aux_lib44.c
new file mode 100644
index 0000000..a17d545
--- /dev/null
+++ b/auxiliary/m_aux_lib44.c
@@ -0,0 +1,93 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+
+void m_cvt_d2s_strvec(int m, struct d_strvec *vd, int vdi, struct s_strvec *vs, int vsi)
+	{
+	double *pd = vd->pa+vdi;
+	float *ps = vs->pa+vsi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		ps[ii] = (float) pd[ii];
+		}
+	return;
+	}
+
+
+
+void m_cvt_s2d_strvec(int m, struct s_strvec *vs, int vsi, struct d_strvec *vd, int vdi)
+	{
+	double *pd = vd->pa+vdi;
+	float *ps = vs->pa+vsi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		pd[ii] = (double) ps[ii];
+		}
+	return;
+	}
+
+
+
+void m_cvt_d2s_strmat(int m, int n, struct d_strmat *Md, int mid, int nid, struct s_strmat *Ms, int mis, int nis)
+	{
+	printf("\nm_cvt_d2s_strmat: feature not implmeneted yet\n\n");
+	exit(1);
+	return;
+	}
+
+
+
+void m_cvt_s2d_strmat(int m, int n, struct s_strmat *Ms, int mis, int nis, struct d_strmat *Md, int mid, int nid)
+	{
+	printf("\nm_cvt_s2d_strmat: feature not implmeneted yet\n\n");
+	exit(1);
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
diff --git a/auxiliary/m_aux_lib48.c b/auxiliary/m_aux_lib48.c
new file mode 100644
index 0000000..e9fdcd2
--- /dev/null
+++ b/auxiliary/m_aux_lib48.c
@@ -0,0 +1,153 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+
+void m_cvt_d2s_strvec(int m, struct d_strvec *vd, int vdi, struct s_strvec *vs, int vsi)
+	{
+	double *pd = vd->pa+vdi;
+	float *ps = vs->pa+vsi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		ps[ii] = (float) pd[ii];
+		}
+	return;
+	}
+
+
+
+void m_cvt_s2d_strvec(int m, struct s_strvec *vs, int vsi, struct d_strvec *vd, int vdi)
+	{
+	double *pd = vd->pa+vdi;
+	float *ps = vs->pa+vsi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		pd[ii] = (double) ps[ii];
+		}
+	return;
+	}
+
+
+
+void m_cvt_d2s_strmat(int m, int n, struct d_strmat *Md, int mid, int nid, struct s_strmat *Ms, int mis, int nis)
+	{
+//	printf("\nm_cvt_d2s_strmat: feature not implmeneted yet\n\n");
+//	exit(1);
+	if(mid!=0 | mis!=0)
+		{
+		printf("\nm_cvt_d2s_strmat: feature not implmeneted yet: mid=%d, mis=%d\n\n", mid, mis);
+		exit(1);
+		}
+	const int psd = 4;
+	const int pss = 8;
+	const int sdd = Md->cn;
+	double *D0 = Md->pA + nid*psd;
+	double *D1;
+	const int sds = Ms->cn;
+	float *S = Ms->pA + nis*pss;
+	int ii, jj, ll;
+	for(ii=0; ii<m-7; ii+=8)
+		{
+		D1 = D0 + psd*sdd;
+		for(jj=0; jj<n; jj++)
+			{
+			S[0+jj*pss] = (float) D0[0+jj*psd];
+			S[1+jj*pss] = (float) D0[1+jj*psd];
+			S[2+jj*pss] = (float) D0[2+jj*psd];
+			S[3+jj*pss] = (float) D0[3+jj*psd];
+			S[4+jj*pss] = (float) D1[0+jj*psd];
+			S[5+jj*pss] = (float) D1[1+jj*psd];
+			S[6+jj*pss] = (float) D1[2+jj*psd];
+			S[7+jj*pss] = (float) D1[3+jj*psd];
+			}
+		D0 += 8*sdd;
+		S  += 8*sds;
+		}
+	if(m-ii>0)
+		{
+		if(m-ii<4)
+			{
+			for(jj=0; jj<n; jj++)
+				{
+				for(ll=0; ll<m-ii; ll++)
+					{
+					S[ll+jj*pss] = (float) D0[ll+jj*psd];
+					}
+				}
+			return;
+			}
+		else
+			{
+			D1 = D0 + psd*sdd;
+			for(jj=0; jj<n; jj++)
+				{
+				S[0+jj*pss] = (float) D0[0+jj*psd];
+				S[1+jj*pss] = (float) D0[1+jj*psd];
+				S[2+jj*pss] = (float) D0[2+jj*psd];
+				S[3+jj*pss] = (float) D0[3+jj*psd];
+				for(ll=0; ll<m-ii-4; ll++)
+					{
+					S[4+ll+jj*pss] = (float) D1[ll+jj*psd];
+					}
+				}
+			}
+		}
+	return;
+	}
+
+
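+// Note: in this 4/8 variant the double-precision source uses 4-row panels
+// (psd = 4) while the single-precision destination uses 8-row panels
+// (pss = 8), so the main loop above merges two consecutive double panels
+// (D0 and D1 = D0 + psd*sdd) into one float panel, converting column by
+// column; only zero row offsets (mid = mis = 0) are supported so far.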
+
+void m_cvt_s2d_strmat(int m, int n, struct s_strmat *Ms, int mis, int nis, struct d_strmat *Md, int mid, int nid)
+	{
+	printf("\nm_cvt_s2d_strmat: feature not implmeneted yet\n\n");
+	exit(1);
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/auxiliary/s_aux_ext_dep_lib.c b/auxiliary/s_aux_ext_dep_lib.c
new file mode 100644
index 0000000..85f7ebc
--- /dev/null
+++ b/auxiliary/s_aux_ext_dep_lib.c
@@ -0,0 +1,633 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#if 0
+#include <malloc.h>
+#endif
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if ! defined(OS_WINDOWS)
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+#endif
+
+
+
+/* creates a zero matrix */
+void s_zeros(float **pA, int row, int col)
+	{
+	*pA = malloc((row*col)*sizeof(float));
+	float *A = *pA;
+	int i;
+	for(i=0; i<row*col; i++) A[i] = 0.0;
+	}
+
+
+
+/* creates a zero matrix aligned to a cache line */
+void s_zeros_align(float **pA, int row, int col)
+	{
+#if defined(OS_WINDOWS)
+	*pA = (float *) _aligned_malloc( (row*col)*sizeof(float), 64 );
+#else
+	void *temp;
+	int err = posix_memalign(&temp, 64, (row*col)*sizeof(float));
+	if(err!=0)
+		{
+		printf("Memory allocation error");
+		exit(1);
+		}
+	*pA = temp;
+#endif
+	float *A = *pA;
+	int i;
+	for(i=0; i<row*col; i++) A[i] = 0.0;
+	}
+
+
+
+/* frees matrix */
+void s_free(float *pA)
+	{
+	free( pA );
+	}
+
+
+
+/* frees aligned matrix */
+void s_free_align(float *pA)
+	{
+#if defined(OS_WINDOWS)
+	_aligned_free( pA );
+#else
+	free( pA );
+#endif
+	}
+
+
+
+/* prints a matrix in column-major format */
+void s_print_mat(int m, int n, float *A, int lda)
+	{
+	int i, j;
+	for(i=0; i<m; i++)
+		{
+		for(j=0; j<n; j++)
+			{
+			printf("%9.5f ", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
+
+
+/* prints the transpose of a matrix in column-major format */
+void s_print_tran_mat(int row, int col, float *A, int lda)
+	{
+	int i, j;
+	for(j=0; j<col; j++)
+		{
+		for(i=0; i<row; i++)
+			{
+			printf("%9.5f ", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
+
+
+/* prints a matrix in column-major format */
+void s_print_to_file_mat(FILE *file, int row, int col, float *A, int lda)
+	{
+	int i, j;
+	for(i=0; i<row; i++)
+		{
+		for(j=0; j<col; j++)
+			{
+			fprintf(file, "%9.5f ", A[i+lda*j]);
+			}
+		fprintf(file, "\n");
+		}
+	fprintf(file, "\n");
+	}	
+
+
+
+/* prints the transpose of a matrix in column-major format */
+void s_print_tran_to_file_mat(FILE *file, int row, int col, float *A, int lda)
+	{
+	int i, j;
+	for(j=0; j<col; j++)
+		{
+		for(i=0; i<row; i++)
+			{
+			fprintf(file, "%9.5f ", A[i+lda*j]);
+			}
+		fprintf(file, "\n");
+		}
+	fprintf(file, "\n");
+	}	
+
+
+
+/* prints a matrix in column-major format (exponential notation) */
+void s_print_e_mat(int m, int n, float *A, int lda)
+	{
+	int i, j;
+	for(i=0; i<m; i++)
+		{
+		for(j=0; j<n; j++)
+			{
+			printf("%e\t", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
+
+
+/* prints the transpose of a matrix in column-major format (exponential notation) */
+void s_print_e_tran_mat(int row, int col, float *A, int lda)
+	{
+	int i, j;
+	for(j=0; j<col; j++)
+		{
+		for(i=0; i<row; i++)
+			{
+			printf("%e\t", A[i+lda*j]);
+			}
+		printf("\n");
+		}
+	printf("\n");
+	}	
+
+
+
+/****************************
+* new interface
+****************************/
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+#include "../include/blasfeo_block_size.h"
+
+
+
+// create a matrix structure for a matrix of size m*n by dynamically allocating the memory
+void s_allocate_strmat(int m, int n, struct s_strmat *sA)
+	{
+	const int bs = S_PS;
+	int nc = S_NC;
+	int al = bs*nc;
+	sA->m = m;
+	sA->n = n;
+	int pm = (m+bs-1)/bs*bs;
+	int cn = (n+nc-1)/nc*nc;
+	sA->pm = pm;
+	sA->cn = cn;
+	s_zeros_align(&(sA->pA), sA->pm, sA->cn);
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	s_zeros_align(&(sA->dA), tmp, 1);
+	sA->use_dA = 0;
+	sA->memory_size = (pm*cn+tmp)*sizeof(float);
+	return;
+	}
+
+
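+// Illustrative usage sketch, assuming sgese_libstr from the single-precision
+// aux sources elsewhere in this patch (s_print_strmat and s_free_strmat are
+// defined below):
+//
+//     struct s_strmat sA;
+//     s_allocate_strmat(5, 3, &sA);         // panel-major storage, zeroed
+//     sgese_libstr(5, 3, 1.0f, &sA, 0, 0);  // set all entries to 1.0
+//     s_print_strmat(5, 3, &sA, 0, 0);
+//     s_free_strmat(&sA);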
+
+// free memory of a matrix structure
+void s_free_strmat(struct s_strmat *sA)
+	{
+	s_free_align(sA->pA);
+	s_free_align(sA->dA);
+	return;
+	}
+
+
+
+// create a vector structure for a vector of size m by dynamically allocating the memory
+void s_allocate_strvec(int m, struct s_strvec *sa)
+	{
+	const int bs = S_PS;
+//	int nc = S_NC;
+//	int al = bs*nc;
+	sa->m = m;
+	int pm = (m+bs-1)/bs*bs;
+	sa->pm = pm;
+	s_zeros_align(&(sa->pa), sa->pm, 1);
+	sa->memory_size = pm*sizeof(float);
+	return;
+	}
+
+
+
+// free memory of a vector structure
+void s_free_strvec(struct s_strvec *sa)
+	{
+	s_free_align(sa->pa);
+	return;
+	}
+
+
+
+// print a matrix structure
+void s_print_strmat(int m, int n, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = S_PS;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int ii, i, j, tmp;
+	ii = 0;
+	if(ai%bs>0)
+		{
+		tmp = bs-ai%bs;
+		tmp = m<tmp ? m : tmp;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%9.5f ", pA[i+bs*j]);
+				}
+			printf("\n");
+			}
+		pA += tmp + bs*(sda-1);
+		m -= tmp;
+		}
+	for( ; ii<m-(bs-1); ii+=bs)
+		{
+		for(i=0; i<bs; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%9.5f ", pA[i+bs*j+sda*ii]);
+				}
+			printf("\n");
+			}
+		}
+	if(ii<m)
+		{
+		tmp = m-ii;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%9.5f ", pA[i+bs*j+sda*ii]);
+				}
+			printf("\n");
+			}
+		}
+	printf("\n");
+	return;
+	}
+
+
+
+// print a vector structure
+void s_print_strvec(int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_mat(m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void s_print_tran_strvec(int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_mat(1, m, pa, 1);
+	return;
+	}
+
+
+
+// print a matrix structure
+void s_print_to_file_strmat(FILE * file, int m, int n, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = S_PS;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int ii, i, j, tmp;
+	ii = 0;
+	if(ai%bs>0)
+		{
+		tmp = bs-ai%bs;
+		tmp = m<tmp ? m : tmp;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				fprintf(file, "%9.5f ", pA[i+bs*j]);
+				}
+			fprintf(file, "\n");
+			}
+		pA += tmp + bs*(sda-1);
+		m -= tmp;
+		}
+	for( ; ii<m-(bs-1); ii+=bs)
+		{
+		for(i=0; i<bs; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				fprintf(file, "%9.5f ", pA[i+bs*j+sda*ii]);
+				}
+			fprintf(file, "\n");
+			}
+		}
+	if(ii<m)
+		{
+		tmp = m-ii;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				fprintf(file, "%9.5f ", pA[i+bs*j+sda*ii]);
+				}
+			fprintf(file, "\n");
+			}
+		}
+	fprintf(file, "\n");
+	return;
+	}
+
+
+
+// print a vector structure
+void s_print_to_file_strvec(FILE * file, int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_to_file_mat(file, m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void s_print_tran_to_file_strvec(FILE * file, int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_to_file_mat(file, 1, m, pa, 1);
+	return;
+	}
+
+
+
+// print a matrix structure
+void s_print_e_strmat(int m, int n, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = S_PS;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int ii, i, j, tmp;
+	ii = 0;
+	if(ai%bs>0)
+		{
+		tmp = bs-ai%bs;
+		tmp = m<tmp ? m : tmp;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%e\t", pA[i+bs*j]);
+				}
+			printf("\n");
+			}
+		pA += tmp + bs*(sda-1);
+		m -= tmp;
+		}
+	for( ; ii<m-(bs-1); ii+=bs)
+		{
+		for(i=0; i<bs; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%e\t", pA[i+bs*j+sda*ii]);
+				}
+			printf("\n");
+			}
+		}
+	if(ii<m)
+		{
+		tmp = m-ii;
+		for(i=0; i<tmp; i++)
+			{
+			for(j=0; j<n; j++)
+				{
+				printf("%e\t", pA[i+bs*j+sda*ii]);
+				}
+			printf("\n");
+			}
+		}
+	printf("\n");
+	return;
+	}
+
+
+
+// print a vector structure
+void s_print_e_strvec(int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_e_mat(m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void s_print_e_tran_strvec(int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_e_mat(1, m, pa, 1);
+	return;
+	}
+
+
+
+#elif defined(LA_BLAS) | defined(LA_REFERENCE)
+
+
+
+// create a matrix structure for a matrix of size m*n
+void s_allocate_strmat(int m, int n, struct s_strmat *sA)
+	{
+	sA->m = m;
+	sA->n = n;
+	s_zeros(&(sA->pA), sA->m, sA->n);
+	int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+	s_zeros(&(sA->dA), tmp, 1);
+	sA->memory_size = (m*n+tmp)*sizeof(float);
+	return;
+	}
+
+
+
+// free memory of a matrix structure
+void s_free_strmat(struct s_strmat *sA)
+	{
+	free(sA->pA);
+	free(sA->dA);
+	return;
+	}
+
+
+
+// create a vector structure for a vector of size m
+void s_allocate_strvec(int m, struct s_strvec *sa)
+	{
+	sa->m = m;
+	s_zeros(&(sa->pa), sa->m, 1);
+	sa->memory_size = m*sizeof(float);
+	return;
+	}
+
+
+
+// free memory of a vector structure
+void s_free_strvec(struct s_strvec *sa)
+	{
+	free(sa->pa);
+	return;
+	}
+
+
+
+// print a matrix structure
+void s_print_strmat(int m, int n, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	s_print_mat(m, n, pA, lda);
+	return;
+	}
+
+
+
+// print a vector structure
+void s_print_strvec(int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_mat(m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void s_print_tran_strvec(int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_mat(1, m, pa, 1);
+	return;
+	}
+
+
+
+// print a matrix structure
+void s_print_to_file_strmat(FILE *file, int m, int n, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	s_print_to_file_mat(file, m, n, pA, lda);
+	return;
+	}
+
+
+
+// print a vector structure
+void s_print_to_file_strvec(FILE *file, int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_to_file_mat(file, m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void s_print_to_file_tran_strvec(FILE *file, int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_to_file_mat(file, 1, m, pa, 1);
+	return;
+	}
+
+
+
+// print a matrix structure
+void s_print_e_strmat(int m, int n, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	s_print_e_mat(m, n, pA, lda);
+	return;
+	}
+
+
+
+// print a vector structure
+void s_print_e_strvec(int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_e_mat(m, 1, pa, m);
+	return;
+	}
+
+
+
+// print the transpose of a vector structure
+void s_print_e_tran_strvec(int m, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	s_print_e_mat(1, m, pa, 1);
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
diff --git a/auxiliary/s_aux_lib.c b/auxiliary/s_aux_lib.c
new file mode 100644
index 0000000..978eb9a
--- /dev/null
+++ b/auxiliary/s_aux_lib.c
@@ -0,0 +1,956 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+
+
+
+#if defined(LA_REFERENCE) | defined(LA_BLAS)
+
+
+
+// return memory size (in bytes) needed for a strmat
+int s_size_strmat(int m, int n)
+	{
+	int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+	int size = (m*n+tmp)*sizeof(float);
+	return size;
+	}
+
+
+
+// return memory size (in bytes) needed for the diagonal of a strmat
+int s_size_diag_strmat(int m, int n)
+	{
+	int size = 0;
+	int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+	size = tmp*sizeof(float);
+	return size;
+	}
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void s_create_strmat(int m, int n, struct s_strmat *sA, void *memory)
+	{
+	sA->m = m;
+	sA->n = n;
+	float *ptr = (float *) memory;
+	sA->pA = ptr;
+	ptr += m*n;
+	int tmp = m<n ? m : n; // al(min(m,n)) // XXX max ???
+	sA->dA = ptr;
+	ptr += tmp;
+	sA->use_dA = 0;
+	sA->memory_size = (m*n+tmp)*sizeof(float);
+	return;
+	}
+
+
+
+// return memory size (in bytes) needed for a strvec
+int s_size_strvec(int m)
+	{
+	int size = m*sizeof(float);
+	return size;
+	}
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void s_create_strvec(int m, struct s_strvec *sa, void *memory)
+	{
+	sa->m = m;
+	float *ptr = (float *) memory;
+	sa->pa = ptr;
+//	ptr += m * n;
+	sa->memory_size = m*sizeof(float);
+	return;
+	}
+
+
+
+// convert a matrix into a matrix structure
+void s_cvt_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+	{
+	int ii, jj;
+	int lda2 = sA->m;
+	float *pA = sA->pA + ai + aj*lda2;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pA[ii+0+jj*lda2] = A[ii+0+jj*lda];
+			pA[ii+1+jj*lda2] = A[ii+1+jj*lda];
+			pA[ii+2+jj*lda2] = A[ii+2+jj*lda];
+			pA[ii+3+jj*lda2] = A[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pA[ii+0+jj*lda2] = A[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// convert and transpose a matrix into a matrix structure
+void s_cvt_tran_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+	{
+	int ii, jj;
+	int lda2 = sA->m;
+	float *pA = sA->pA + ai + aj*lda2;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pA[jj+(ii+0)*lda2] = A[ii+0+jj*lda];
+			pA[jj+(ii+1)*lda2] = A[ii+1+jj*lda];
+			pA[jj+(ii+2)*lda2] = A[ii+2+jj*lda];
+			pA[jj+(ii+3)*lda2] = A[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pA[jj+(ii+0)*lda2] = A[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector into a vector structure
+void s_cvt_vec2strvec(int m, float *a, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		pa[ii] = a[ii];
+	return;
+	}
+
+
+
+// convert a matrix structure into a matrix
+void s_cvt_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+	{
+	int ii, jj;
+	int lda2 = sA->m;
+	float *pA = sA->pA + ai + aj*lda2;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			A[ii+0+jj*lda] = pA[ii+0+jj*lda2];
+			A[ii+1+jj*lda] = pA[ii+1+jj*lda2];
+			A[ii+2+jj*lda] = pA[ii+2+jj*lda2];
+			A[ii+3+jj*lda] = pA[ii+3+jj*lda2];
+			}
+		for(; ii<m; ii++)
+			{
+			A[ii+0+jj*lda] = pA[ii+0+jj*lda2];
+			}
+		}
+	return;
+	}
+
+
+
+// convert and transpose a matrix structure into a matrix
+void s_cvt_tran_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+	{
+	int ii, jj;
+	int lda2 = sA->m;
+	float *pA = sA->pA + ai + aj*lda2;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			A[jj+(ii+0)*lda] = pA[ii+0+jj*lda2];
+			A[jj+(ii+1)*lda] = pA[ii+1+jj*lda2];
+			A[jj+(ii+2)*lda] = pA[ii+2+jj*lda2];
+			A[jj+(ii+3)*lda] = pA[ii+3+jj*lda2];
+			}
+		for(; ii<m; ii++)
+			{
+			A[jj+(ii+0)*lda] = pA[ii+0+jj*lda2];
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector structure into a vector
+void s_cvt_strvec2vec(int m, struct s_strvec *sa, int ai, float *a)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		a[ii] = pa[ii];
+	return;
+	}
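+
+/*
+   Usage sketch (illustrative only): moving data between plain column-major arrays and
+   the structures, using the conversion routines above. A, x, sA, sAt and sx are
+   hypothetical and assumed already created with matching sizes.
+
+   s_cvt_mat2strmat(m, n, A, lda, &sA, 0, 0);        // A -> sA
+   s_cvt_tran_mat2strmat(m, n, A, lda, &sAt, 0, 0);  // A^T -> sAt (sAt created as n x m)
+   s_cvt_vec2strvec(m, x, &sx, 0);                   // x -> sx
+
+   s_cvt_strmat2mat(m, n, &sA, 0, 0, A, lda);        // sA -> A (round trip)
+   s_cvt_strvec2vec(m, &sx, 0, x);                   // sx -> x
+*/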
+
+
+
+// cast a matrix into a matrix structure
+void s_cast_mat2strmat(float *A, struct s_strmat *sA)
+	{
+	sA->pA = A;
+	return;
+	}
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void s_cast_diag_mat2strmat(float *dA, struct s_strmat *sA)
+	{
+	sA->dA = dA;
+	return;
+	}
+
+
+
+// cast a vector into a vector structure
+void s_cast_vec2vecmat(float *a, struct s_strvec *sa)
+	{
+	sa->pa = a;
+	return;
+	}
+
+
+
+// insert element into strmat
+void sgein1_libstr(float a, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	pA[0] = a;
+	return;
+	}
+
+
+
+// extract element from strmat
+float sgeex1_libstr(struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	return pA[0];
+	}
+
+
+
+// insert element into strvec
+void svecin1_libstr(float a, struct s_strvec *sx, int xi)
+	{
+	float *x = sx->pa + xi;
+	x[0] = a;
+	return;
+	}
+
+
+
+// extract element from strvec
+float svecex1_libstr(struct s_strvec *sx, int xi)
+	{
+	float *x = sx->pa + xi;
+	return x[0];
+	}
+
+
+
+// set all elements of a strmat to a value
+void sgese_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			pA[ii+lda*jj] = alpha;
+			}
+		}
+	return;
+	}
+
+
+
+// set all elements of a strvec to a value
+void svecse_libstr(int m, float alpha, struct s_strvec *sx, int xi)
+	{
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		x[ii] = alpha;
+	return;
+	}
+
+
+
+// extract diagonal to vector
+void sdiaex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		x[ii] = alpha*pA[ii*(lda+1)];
+	return;
+	}
+
+
+
+// insert a vector into diagonal
+void sdiain_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*(lda+1)] = alpha*x[ii];
+	return;
+	}
+
+
+
+// extract a row into a vector
+void srowex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		x[ii] = alpha*pA[ii*lda];
+	return;
+	}
+
+
+
+// insert a vector into a row
+void srowin_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*lda] = alpha*x[ii];
+	return;
+	}
+
+
+
+// add a vector to a row
+void srowad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*lda] += alpha*x[ii];
+	return;
+	}
+
+
+
+// swap two rows of a matrix struct
+void srowsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+	int ii;
+	float tmp;
+	for(ii=0; ii<kmax; ii++)
+		{
+		tmp = pA[ii*lda];
+		pA[ii*lda] = pC[ii*ldc];
+		pC[ii*ldc] = tmp;
+		}
+	return;
+	}
+
+
+
+// permute the rows of a matrix struct
+void srowpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			srowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+		}
+	return;
+	}
+
+
+
+// insert a vector into a column
+void scolin_libstr(int kmax, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii] = x[ii];
+	return;
+	}
+
+
+
+// swap two cols of a matrix struct
+void scolsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+	int ii;
+	float tmp;
+	for(ii=0; ii<kmax; ii++)
+		{
+		tmp = pA[ii];
+		pA[ii] = pC[ii];
+		pC[ii] = tmp;
+		}
+	return;
+	}
+
+
+
+// permute the cols of a matrix struct
+void scolpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			scolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+		}
+	return;
+	}
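+
+/*
+   Usage sketch (illustrative only): applying a LAPACK-style pivot sequence with the
+   swap/permute helpers above. ipiv is hypothetical here; entry k holds the (0-based)
+   index of the row swapped with row k, as produced by a getrf-type factorization.
+
+   int ipiv[4] = {2, 1, 3, 3};
+   srowpe_libstr(4, ipiv, &sA);   // permute the rows of sA in place
+   scolpe_libstr(4, ipiv, &sA);   // or permute the columns with the same pivot vector
+*/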
+
+
+
+// copy a generic strmat into a generic strmat
+void sgecp_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+			pC[ii+1+jj*ldc] = pA[ii+1+jj*lda];
+			pC[ii+2+jj*ldc] = pA[ii+2+jj*lda];
+			pC[ii+3+jj*ldc] = pA[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// scale a generic strmat
+void sgesc_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pA[ii+0+jj*lda] *= alpha;
+			pA[ii+1+jj*lda] *= alpha;
+			pA[ii+2+jj*lda] *= alpha;
+			pA[ii+3+jj*lda] *= alpha;
+			}
+		for(; ii<m; ii++)
+			{
+			pA[ii+0+jj*lda] *= alpha;
+			}
+		}
+	return;
+	}
+
+
+
+// copy a strvec into a strvec
+void sveccp_libstr(int m, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+	{
+	float *pa = sa->pa + ai;
+	float *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] = pa[ii+0];
+		pc[ii+1] = pa[ii+1];
+		pc[ii+2] = pa[ii+2];
+		pc[ii+3] = pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] = pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// scale a strvec
+void svecsc_libstr(int m, float alpha, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pa[ii+0] *= alpha;
+		pa[ii+1] *= alpha;
+		pa[ii+2] *= alpha;
+		pa[ii+3] *= alpha;
+		}
+	for(; ii<m; ii++)
+		{
+		pa[ii+0] *= alpha;
+		}
+	return;
+	}
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void strcp_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<m; jj++)
+		{
+		ii = jj;
+		for(; ii<m; ii++)
+			{
+			pC[ii+0+jj*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// scale and add a generic strmat into a generic strmat
+void sgead_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pC[ii+0+jj*ldc] += alpha*pA[ii+0+jj*lda];
+			pC[ii+1+jj*ldc] += alpha*pA[ii+1+jj*lda];
+			pC[ii+2+jj*ldc] += alpha*pA[ii+2+jj*lda];
+			pC[ii+3+jj*ldc] += alpha*pA[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pC[ii+0+jj*ldc] += alpha*pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// add a scaled strvec to a strvec
+void svecad_libstr(int m, float alpha, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+	{
+	float *pa = sa->pa + ai;
+	float *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		pc[ii+1] += alpha*pa[ii+1];
+		pc[ii+2] += alpha*pa[ii+2];
+		pc[ii+3] += alpha*pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void sgetr_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-3; ii+=4)
+			{
+			pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+			pC[jj+(ii+1)*ldc] = pA[ii+1+jj*lda];
+			pC[jj+(ii+2)*ldc] = pA[ii+2+jj*lda];
+			pC[jj+(ii+3)*ldc] = pA[ii+3+jj*lda];
+			}
+		for(; ii<m; ii++)
+			{
+			pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void strtr_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<m; jj++)
+		{
+		ii = jj;
+		for(; ii<m; ii++)
+			{
+			pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void strtr_u_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	int ldc = sC->m;
+	float *pC = sC->pA + ci + cj*ldc;
+	int ii, jj;
+	for(jj=0; jj<m; jj++)
+		{
+		ii = 0;
+		for(; ii<=jj; ii++)
+			{
+			pC[jj+(ii+0)*ldc] = pA[ii+0+jj*lda];
+			}
+		}
+	return;
+	}
+
+
+
+// insert a strvec to the diagonal of a strmat, sparse formulation
+void sdiain_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	float *x = sx->pa + xi;
+	int ldd = sD->m;
+	float *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*(ldd+1)] = alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// extract the diagonal of a strmat into a strvec, sparse formulation
+void sdiaex_sp_libstr(int kmax, float alpha, int *idx, struct s_strmat *sD, int di, int dj, struct s_strvec *sx, int xi)
+	{
+	float *x = sx->pa + xi;
+	int ldd = sD->m;
+	float *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		x[jj] = alpha * pD[ii*(ldd+1)];
+		}
+	return;
+	}
+
+
+
+// add a vector to diagonal
+void sdiaad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	int lda = sA->m;
+	float *pA = sA->pA + ai + aj*lda;
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		pA[ii*(lda+1)] += alpha*x[ii];
+	return;
+	}
+
+
+
+// add a scaled strvec to the diagonal of a strmat, sparse formulation
+void sdiaad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	float *x = sx->pa + xi;
+	int ldd = sD->m;
+	float *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*(ldd+1)] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation
+void sdiaadin_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	int ldd = sD->m;
+	float *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*(ldd+1)] = y[jj] + alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to row of strmat, sparse formulation
+void srowad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	float *x = sx->pa + xi;
+	int ldd = sD->m;
+	float *pD = sD->pA + di + dj*ldd;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*ldd] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+
+void svecad_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[idx[ii]] += alpha * x[ii];
+	return;
+	}
+
+
+
+void svecin_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[idx[ii]] = alpha * x[ii];
+	return;
+	}
+
+
+
+void svecex_sp_libstr(int m, float alpha, int *idx, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[ii] = alpha * x[idx[ii]];
+	return;
+	}
+
+
+// clip without mask return
+void sveccl_libstr(int m, struct s_strvec *sxm, int xim, struct s_strvec *sx, int xi, struct s_strvec *sxp, int xip, struct s_strvec *sz, int zi)
+	{
+	float *xm = sxm->pa + xim;
+	float *x  = sx->pa + xi;
+	float *xp = sxp->pa + xip;
+	float *z  = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		if(x[ii]>=xp[ii])
+			{
+			z[ii] = xp[ii];
+			}
+		else if(x[ii]<=xm[ii])
+			{
+			z[ii] = xm[ii];
+			}
+		else
+			{
+			z[ii] = x[ii];
+			}
+		}
+	return;
+	}
+
+
+
+// clip with mask return
+void sveccl_mask_libstr(int m, struct s_strvec *sxm, int xim, struct s_strvec *sx, int xi, struct s_strvec *sxp, int xip, struct s_strvec *sz, int zi, struct s_strvec *sm, int mi)
+	{
+	float *xm = sxm->pa + xim;
+	float *x  = sx->pa + xi;
+	float *xp = sxp->pa + xip;
+	float *z  = sz->pa + zi;
+	float *mask  = sm->pa + mi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		if(x[ii]>=xp[ii])
+			{
+			z[ii] = xp[ii];
+			mask[ii] = 1.0;
+			}
+		else if(x[ii]<=xm[ii])
+			{
+			z[ii] = xm[ii];
+			mask[ii] = -1.0;
+			}
+		else
+			{
+			z[ii] = x[ii];
+			mask[ii] = 0.0;
+			}
+		}
+	return;
+	}
+
+
+// zero out components using mask
+void svecze_libstr(int m, struct s_strvec *sm, int mi, struct s_strvec *sv, int vi, struct s_strvec *se, int ei)
+	{
+	float *mask = sm->pa + mi;
+	float *v = sv->pa + vi;
+	float *e = se->pa + ei;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		{
+		if(mask[ii]==0)
+			{
+			e[ii] = v[ii];
+			}
+		else
+			{
+			e[ii] = 0;
+			}
+		}
+	return;
+	}
+
+
+
+void svecnrm_inf_libstr(int m, struct s_strvec *sx, int xi, float *ptr_norm)
+	{
+	int ii;
+	float *x = sx->pa + xi;
+	float norm = 0.0;
+	for(ii=0; ii<m; ii++)
+		norm = fmax(norm, fabs(x[ii]));
+	*ptr_norm = norm;
+	return;
+	}
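+
+/*
+   Usage sketch (illustrative only): clipping a vector to box bounds, recording which
+   bound is active, masking another vector accordingly and measuring its infinity norm
+   with the routines above. All strvec objects are hypothetical and of length m.
+
+   sveccl_mask_libstr(m, &sxm, 0, &sx, 0, &sxp, 0, &sz, 0, &smask, 0);
+   // smask[i] is -1.0 / 0.0 / 1.0 for lower bound active / inactive / upper bound active
+   svecze_libstr(m, &smask, 0, &sv, 0, &se, 0);  // e[i] = v[i] where no bound is active, 0 elsewhere
+   float nrm;
+   svecnrm_inf_libstr(m, &se, 0, &nrm);          // nrm = max_i |e[i]|
+*/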
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
diff --git a/auxiliary/s_aux_lib4.c b/auxiliary/s_aux_lib4.c
new file mode 100644
index 0000000..12acc47
--- /dev/null
+++ b/auxiliary/s_aux_lib4.c
@@ -0,0 +1,3107 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_block_size.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+// add a scaled strvec to a strvec
+void svecad_libstr(int m, float *alphap, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+	{
+	float alpha = alphap[0];
+	float *pa = sa->pa + ai;
+	float *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		pc[ii+1] += alpha*pa[ii+1];
+		pc[ii+2] += alpha*pa[ii+2];
+		pc[ii+3] += alpha*pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// transpose a general matrix; m and n refer to the original matrix
+void sgetr_lib(int m, int n, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+	{
+
+/*
+
+m = 5
+n = 3
+offsetA = 1
+offsetC = 2
+
+A = 
+ x x x
+ -
+ x x x
+ x x x
+ x x x
+ x x x
+
+C =
+ x x x x x
+ x x x x x
+ -
+ x x x x x
+
+*/
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int mna = (bs-offsetA%bs)%bs;
+	mna = m<mna ? m : mna;
+	int nna = (bs-offsetC%bs)%bs;
+	nna = n<nna ? n : nna;
+	
+	int ii;
+
+	ii = 0;
+
+	if(mna>0)
+		{
+		if(mna==1)
+			kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+		else if(mna==2)
+			kernel_sgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
+		else //if(mna==3)
+			kernel_sgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
+		ii += mna;
+		pA += mna + bs*(sda-1);
+		pC += mna*bs;
+		}
+	for( ; ii<m-3; ii+=4)
+//	for( ; ii<m; ii+=4)
+		{
+		kernel_sgetr_4_lib4(0, n, nna, alpha, pA, pC, sdc);
+		pA += bs*sda;
+		pC += bs*bs;
+		}
+
+	// clean-up at the end using smaller kernels
+	if(ii==m)
+		return;
+	
+	if(m-ii==1)
+		kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+	else if(m-ii==2)
+		kernel_sgetr_2_lib4(0, n, nna, alpha, pA, pC, sdc);
+	else if(m-ii==3)
+		kernel_sgetr_3_lib4(0, n, nna, alpha, pA, pC, sdc);
+		
+	return;
+	
+	}	
+
+
+
+// transpose lower triangular matrix
+void strtr_l_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+	{
+
+/*
+
+A = 
+ x
+ x x
+ x x x
+ x x x x
+  
+ x x x x x
+ x x x x x x
+ x x x x x x x
+ x x x x x x x x
+
+C =
+ x x x x x x x x
+  
+   x x x x x x x
+     x x x x x x
+	   x x x x x
+	     x x x x
+
+	       x x x
+	         x x
+	           x
+
+*/
+
+	int n = m;
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int mna = (bs-offsetA%bs)%bs;
+	mna = m<mna ? m : mna;
+	int nna = (bs-offsetC%bs)%bs;
+	nna = n<nna ? n : nna;
+	
+	int ii;
+
+	ii = 0;
+
+	if(mna>0)
+		{
+		if(mna==1)
+			{
+			pC[0] = alpha * pA[0];
+			}
+		else if(mna==2)
+			{
+			if(nna==1)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
+				}
+			else
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				}
+			}
+		else //if(mna==3)
+			{
+			if(nna==1)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[0+bs*2] = alpha * pA[2+bs*0];
+				pC[1+bs*(0+sdc)] = alpha * pA[1+bs*1];
+				pC[1+bs*(1+sdc)] = alpha * pA[2+bs*1];
+				pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
+				}
+			else if(nna==2)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[0+bs*2] = alpha * pA[2+bs*0];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[1+bs*2] = alpha * pA[2+bs*1];
+				pC[2+bs*(1+sdc)] = alpha * pA[2+bs*2];
+				}
+			else
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[0+bs*2] = alpha * pA[2+bs*0];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[1+bs*2] = alpha * pA[2+bs*1];
+				pC[2+bs*2] = alpha * pA[2+bs*2];
+				}
+			}
+		ii += mna;
+		pA += mna + bs*(sda-1);
+		pC += mna*bs;
+		}
+	for( ; ii<m-3; ii+=4)
+		{
+		kernel_sgetr_4_lib4(1, ii, nna, alpha, pA, pC, sdc);
+		pA += bs*sda;
+		pC += bs*bs;
+		}
+	
+	// clean-up at the end using smaller kernels
+	if(ii==m)
+		return;
+	
+	if(m-ii==1)
+		kernel_sgetr_1_lib4(1, ii, nna, alpha, pA, pC, sdc);
+	else if(m-ii==2)
+		kernel_sgetr_2_lib4(1, ii, nna, alpha, pA, pC, sdc);
+	else if(m-ii==3)
+		kernel_sgetr_3_lib4(1, ii, nna, alpha, pA, pC, sdc);
+		
+	return;
+
+	}
+
+
+
+// transpose an aligned upper triangular matrix into an aligned lower triangular matrix
+void strtr_u_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+	{
+
+/*
+
+A = 
+ x x x x x x x x
+   x x x x x x x
+
+     x x x x x x
+       x x x x x
+         x x x x
+           x x x
+             x x
+               x
+
+C = 
+ x
+
+ x x
+ x x x
+ x x x x
+ x x x x x
+ x x x x x x
+ x x x x x x x
+ x x x x x x x x
+
+*/
+
+	int n = m;
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int mna = (bs-offsetA%bs)%bs;
+	mna = m<mna ? m : mna;
+	int nna = (bs-offsetC%bs)%bs;
+	nna = n<nna ? n : nna;
+	int tna = nna;
+	
+	int ii;
+
+	ii = 0;
+
+	if(mna>0)
+		{
+		if(mna==1)
+			{
+			kernel_sgetr_1_lib4(0, n, nna, alpha, pA, pC, sdc);
+			if(nna!=1)
+				{
+//				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pA += 1*bs;
+				pC += 1;
+				tna = (bs-(offsetC+1)%bs)%bs;
+				}
+			else //if(nna==1)
+				{
+//				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pA += 1*bs;
+				pC += 1 + (sdc-1)*bs;
+				tna = 0; //(bs-(offsetC+1)%bs)%bs;
+				}
+//			kernel_sgetr_1_lib4(0, n-1, tna, alpha, pA, pC, sdc);
+			}
+		else if(mna==2)
+			{
+			if(nna==0 || nna==3)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pA += 2*bs;
+				pC += 2;
+				tna = (bs-(offsetC+2)%bs)%bs;
+				kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+				}
+			else if(nna==1)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pA += 1*bs;
+				pC += 1 + (sdc-1)*bs;
+//				pC[0+bs*0] = alpha * pA[0+bs*0];
+//				pC[0+bs*1] = alpha * pA[1+bs*0];
+				kernel_sgetr_2_lib4(0, n-1, 0, alpha, pA, pC, sdc);
+				pA += 1*bs;
+				pC += 1;
+				tna = 3; //(bs-(offsetC+2)%bs)%bs;
+//				kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+				}
+			else if(nna==2)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pA += 2*bs;
+				pC += 2 + (sdc-1)*bs;
+				tna = 0; //(bs-(offsetC+2)%bs)%bs;
+				kernel_sgetr_2_lib4(0, n-2, tna, alpha, pA, pC, sdc);
+				}
+			}
+		else //if(mna==3)
+			{
+			if(nna==0)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[2+bs*0] = alpha * pA[0+bs*2];
+				pC[2+bs*1] = alpha * pA[1+bs*2];
+				pC[2+bs*2] = alpha * pA[2+bs*2];
+				pA += 3*bs;
+				pC += 3;
+				tna = 1;
+				kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+				}
+			else if(nna==1)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pA += bs;
+				pC += 1 + (sdc-1)*bs;
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[0+bs*1] = alpha * pA[1+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[1+bs*2] = alpha * pA[2+bs*1];
+				pA += 2*bs;
+				pC += 2;
+				tna = 2;
+				kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+				}
+			else if(nna==2)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pA += 2*bs;
+				pC += 2 + (sdc-1)*bs;
+//				pC[0+bs*0] = alpha * pA[0+bs*0];
+//				pC[0+bs*1] = alpha * pA[1+bs*0];
+//				pC[0+bs*2] = alpha * pA[2+bs*0];
+				kernel_sgetr_3_lib4(0, n-2, 0, alpha, pA, pC, sdc);
+				pA += 1*bs;
+				pC += 1;
+				tna = 3;
+//				kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+				}
+			else //if(nna==3)
+				{
+				pC[0+bs*0] = alpha * pA[0+bs*0];
+				pC[1+bs*0] = alpha * pA[0+bs*1];
+				pC[1+bs*1] = alpha * pA[1+bs*1];
+				pC[2+bs*0] = alpha * pA[0+bs*2];
+				pC[2+bs*1] = alpha * pA[1+bs*2];
+				pC[2+bs*2] = alpha * pA[2+bs*2];
+				pA += 3*bs;
+				pC += 3 + (sdc-1)*bs;
+				tna = 0;
+				kernel_sgetr_3_lib4(0, n-3, tna, alpha, pA, pC, sdc);
+				}
+			}
+		ii += mna;
+		pA += mna + bs*(sda-1);
+		pC += mna*bs;
+		}
+	for( ; ii<m-3; ii+=4)
+		{
+		if(tna==0)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[2+bs*0] = alpha * pA[0+bs*2];
+			pC[2+bs*1] = alpha * pA[1+bs*2];
+			pC[2+bs*2] = alpha * pA[2+bs*2];
+			pC[3+bs*0] = alpha * pA[0+bs*3];
+			pC[3+bs*1] = alpha * pA[1+bs*3];
+			pC[3+bs*2] = alpha * pA[2+bs*3];
+			pC[3+bs*3] = alpha * pA[3+bs*3];
+			pA += 4*bs;
+			pC += sdc*bs;
+			kernel_sgetr_4_lib4(0, n-ii-4, 0, alpha, pA, pC, sdc);
+			}
+		else if(tna==1)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pA += bs;
+			pC += 1 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[1+bs*2] = alpha * pA[2+bs*1];
+			pC[2+bs*0] = alpha * pA[0+bs*2];
+			pC[2+bs*1] = alpha * pA[1+bs*2];
+			pC[2+bs*2] = alpha * pA[2+bs*2];
+			pC[2+bs*3] = alpha * pA[3+bs*2];
+			pA += 3*bs;
+			pC += 3;
+			kernel_sgetr_4_lib4(0, n-ii-4, 1, alpha, pA, pC, sdc);
+			}
+		else if(tna==2)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pA += 2*bs;
+			pC += 2 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[0+bs*2] = alpha * pA[2+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[1+bs*2] = alpha * pA[2+bs*1];
+			pC[1+bs*3] = alpha * pA[3+bs*1];
+			pA += 2*bs;
+			pC += 2;
+			kernel_sgetr_4_lib4(0, n-ii-4, 2, alpha, pA, pC, sdc);
+			}
+		else //if(tna==3)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[2+bs*0] = alpha * pA[0+bs*2];
+			pC[2+bs*1] = alpha * pA[1+bs*2];
+			pC[2+bs*2] = alpha * pA[2+bs*2];
+			pA += 3*bs;
+			pC += 3 + (sdc-1)*bs;
+			kernel_sgetr_4_lib4(0, n-ii-3, 0, alpha, pA, pC, sdc);
+//			pC[0+bs*0] = alpha * pA[0+bs*0];
+//			pC[0+bs*1] = alpha * pA[1+bs*0];
+//			pC[0+bs*2] = alpha * pA[2+bs*0];
+//			pC[0+bs*3] = alpha * pA[3+bs*0];
+			pA += bs;
+			pC += 1;
+//			kernel_sgetr_4_lib4(0, n-ii-4, tna, alpha, pA, pC, sdc);
+			}
+		pA += bs*sda;
+		pC += bs*bs;
+		}
+
+	// clean-up at the end
+	if(ii==m)
+		return;
+	
+	if(m-ii==1)
+		{
+		pC[0+bs*0] = alpha * pA[0+bs*0];
+		}
+	else if(m-ii==2)
+		{
+		if(tna!=1)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			}
+		else //if(tna==1)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pA += bs;
+			pC += 1 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			}
+		}
+	else if(m-ii==3)
+		{
+		if(tna==0 || tna==3)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[2+bs*0] = alpha * pA[0+bs*2];
+			pC[2+bs*1] = alpha * pA[1+bs*2];
+			pC[2+bs*2] = alpha * pA[2+bs*2];
+			}
+		else if(tna==1)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pA += bs;
+			pC += 1 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pC[1+bs*2] = alpha * pA[2+bs*1];
+			}
+		else //if(tna==2)
+			{
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[1+bs*0] = alpha * pA[0+bs*1];
+			pC[1+bs*1] = alpha * pA[1+bs*1];
+			pA += 2*bs;
+			pC += 2 + (sdc-1)*bs;
+			pC[0+bs*0] = alpha * pA[0+bs*0];
+			pC[0+bs*1] = alpha * pA[1+bs*0];
+			pC[0+bs*2] = alpha * pA[2+bs*0];
+			}
+		}
+		
+	return;
+
+	}
+
+
+
+// regularize diagonal 
+void sdiareg_lib(int kmax, float reg, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] += reg;
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+(jj+0)*bs+0] += reg;
+		pD[jj*sdd+(jj+1)*bs+1] += reg;
+		pD[jj*sdd+(jj+2)*bs+2] += reg;
+		pD[jj*sdd+(jj+3)*bs+3] += reg;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] += reg;
+		}
+	
+	}
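+
+/*
+   Usage sketch (illustrative only): the *_lib routines in this file operate on raw
+   panel-major data; 'offset' is the row position inside the first 4-row panel and the
+   last argument is the panel stride sdd (each panel is bs*sdd floats long). The
+   wrappers further below (e.g. sdiain_libstr) show the standard way to derive these
+   from a strmat; sA, ai, aj and kmax are hypothetical here.
+
+   int sdd = sA.cn;
+   float *pD = sA.pA + ai/4*4*sdd + ai%4 + aj*4;
+   sdiareg_lib(kmax, 1e-5f, ai%4, pD, sdd);   // add 1e-5 to kmax diagonal entries
+*/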
+
+
+
+// insert vector to diagonal 
+void sdiain_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] = alpha*x[ll];
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+(jj+0)*bs+0] = alpha*x[jj+0];
+		pD[jj*sdd+(jj+1)*bs+1] = alpha*x[jj+1];
+		pD[jj*sdd+(jj+2)*bs+2] = alpha*x[jj+2];
+		pD[jj*sdd+(jj+3)*bs+3] = alpha*x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] = alpha*x[jj+ll];
+		}
+	
+	}
+
+
+
+// insert sqrt of vector to diagonal 
+void sdiain_sqrt_lib(int kmax, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] = sqrt(x[ll]);
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+(jj+0)*bs+0] = sqrt(x[jj+0]);
+		pD[jj*sdd+(jj+1)*bs+1] = sqrt(x[jj+1]);
+		pD[jj*sdd+(jj+2)*bs+2] = sqrt(x[jj+2]);
+		pD[jj*sdd+(jj+3)*bs+3] = sqrt(x[jj+3]);
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] = sqrt(x[jj+ll]);
+		}
+	
+	}
+
+
+
+// extract diagonal to vector 
+void sdiaex_lib(int kmax, float alpha, int offset, float *pD, int sdd, float *x)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			x[ll] = alpha * pD[ll+bs*ll];
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		x[jj+0] = alpha * pD[jj*sdd+(jj+0)*bs+0];
+		x[jj+1] = alpha * pD[jj*sdd+(jj+1)*bs+1];
+		x[jj+2] = alpha * pD[jj*sdd+(jj+2)*bs+2];
+		x[jj+3] = alpha * pD[jj*sdd+(jj+3)*bs+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		x[jj+ll] = alpha * pD[jj*sdd+(jj+ll)*bs+ll];
+		}
+	
+	}
+
+
+
+// add scaled vector to diagonal 
+void sdiaad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] += alpha * x[ll];
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+(jj+0)*bs+0] += alpha * x[jj+0];
+		pD[jj*sdd+(jj+1)*bs+1] += alpha * x[jj+1];
+		pD[jj*sdd+(jj+2)*bs+2] += alpha * x[jj+2];
+		pD[jj*sdd+(jj+3)*bs+3] += alpha * x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] += alpha * x[jj+ll];
+		}
+	
+	}
+
+
+
+// insert vector to diagonal, sparse formulation 
+void sdiain_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] = alpha * x[jj];
+		}
+	
+	}
+
+
+
+// extract diagonal to vector, sparse formulation 
+void sdiaex_libsp(int kmax, int *idx, float alpha, float *pD, int sdd, float *x)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		x[jj] = alpha * pD[ii/bs*bs*sdd+ii%bs+ii*bs];
+		}
+	
+	}
+
+
+
+// add scaled vector to diagonal, sparse formulation 
+void sdiaad_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] += alpha * x[jj];
+		}
+	
+	}
+
+
+
+// add scaled vector to another vector and insert to diagonal, sparse formulation 
+void sdiaadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] = y[jj] + alpha * x[jj];
+		}
+	
+	}
+
+
+
+// insert vector to row 
+void srowin_lib(int kmax, float alpha, float *x, float *pD)
+	{
+	
+	const int bs = 4;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[(jj+0)*bs] = alpha*x[jj+0];
+		pD[(jj+1)*bs] = alpha*x[jj+1];
+		pD[(jj+2)*bs] = alpha*x[jj+2];
+		pD[(jj+3)*bs] = alpha*x[jj+3];
+		}
+	for(; jj<kmax; jj++)
+		{
+		pD[(jj)*bs] = alpha*x[jj];
+		}
+	
+	}
+
+
+
+// extract row to vector
+void srowex_lib(int kmax, float alpha, float *pD, float *x)
+	{
+	
+	const int bs = 4;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		x[jj+0] = alpha*pD[(jj+0)*bs];
+		x[jj+1] = alpha*pD[(jj+1)*bs];
+		x[jj+2] = alpha*pD[(jj+2)*bs];
+		x[jj+3] = alpha*pD[(jj+3)*bs];
+		}
+	for(; jj<kmax; jj++)
+		{
+		x[jj] = alpha*pD[(jj)*bs];
+		}
+	
+	}
+
+
+
+// add scaled vector to row 
+void srowad_lib(int kmax, float alpha, float *x, float *pD)
+	{
+
+	const int bs = 4;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[(jj+0)*bs] += alpha * x[jj+0];
+		pD[(jj+1)*bs] += alpha * x[jj+1];
+		pD[(jj+2)*bs] += alpha * x[jj+2];
+		pD[(jj+3)*bs] += alpha * x[jj+3];
+		}
+	for(; jj<kmax; jj++)
+		{
+		pD[(jj)*bs] += alpha * x[jj];
+		}
+	
+	}
+
+
+
+// insert vector to row, sparse formulation 
+void srowin_libsp(int kmax, float alpha, int *idx, float *x, float *pD)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] = alpha*x[jj];
+		}
+	
+	}
+
+
+
+// add scaled vector to row, sparse formulation 
+void srowad_libsp(int kmax, int *idx, float alpha, float *x, float *pD)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] += alpha * x[jj];
+		}
+	
+	}
+
+
+
+// add scaled vector to another vector and insert to row, sparse formulation 
+void srowadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] = y[jj] + alpha * x[jj];
+		}
+	
+	}
+
+
+
+// swap two rows
+void srowsw_lib(int kmax, float *pA, float *pC)
+	{
+
+	const int bs = 4;
+
+	int ii;
+	float tmp;
+
+	for(ii=0; ii<kmax-3; ii+=4)
+		{
+		tmp = pA[0+bs*0];
+		pA[0+bs*0] = pC[0+bs*0];
+		pC[0+bs*0] = tmp;
+		tmp = pA[0+bs*1];
+		pA[0+bs*1] = pC[0+bs*1];
+		pC[0+bs*1] = tmp;
+		tmp = pA[0+bs*2];
+		pA[0+bs*2] = pC[0+bs*2];
+		pC[0+bs*2] = tmp;
+		tmp = pA[0+bs*3];
+		pA[0+bs*3] = pC[0+bs*3];
+		pC[0+bs*3] = tmp;
+		pA += 4*bs;
+		pC += 4*bs;
+		}
+	for( ; ii<kmax; ii++)
+		{
+		tmp = pA[0+bs*0];
+		pA[0+bs*0] = pC[0+bs*0];
+		pC[0+bs*0] = tmp;
+		pA += 1*bs;
+		pC += 1*bs;
+		}
+	
+	}
+
+
+
+// insert vector to column 
+void scolin_lib(int kmax, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll] = x[ll];
+			}
+		pD += kna + bs*(sdd-1);
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+0] = x[jj+0];
+		pD[jj*sdd+1] = x[jj+1];
+		pD[jj*sdd+2] = x[jj+2];
+		pD[jj*sdd+3] = x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+ll] = x[jj+ll];
+		}
+	
+	}
+
+
+
+// add scaled vector to column 
+void scolad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll] += alpha * x[ll];
+			}
+		pD += kna + bs*(sdd-1);
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[jj*sdd+0] += alpha * x[jj+0];
+		pD[jj*sdd+1] += alpha * x[jj+1];
+		pD[jj*sdd+2] += alpha * x[jj+2];
+		pD[jj*sdd+3] += alpha * x[jj+3];
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+ll] += alpha * x[jj+ll];
+		}
+	
+	}
+
+
+
+// insert vector to column, sparse formulation 
+void scolin_libsp(int kmax, int *idx, float *x, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs] = x[jj];
+		}
+	
+	}
+
+
+
+// add scaled vector to column, sparse formulation 
+void scolad_libsp(int kmax, float alpha, int *idx, float *x, float *pD, int sdd)
+	{
+
+	const int bs = 4;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs] += alpha * x[jj];
+		}
+	
+	}
+
+
+
+// swap two cols
+void scolsw_lib(int kmax, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+	{
+
+	const int bs = 4;
+
+	int ii;
+
+	float tmp;
+
+	if(offsetA==offsetC)
+		{
+		if(offsetA>0)
+			{
+			ii = 0;
+			for(; ii<bs-offsetA; ii++)
+				{
+				tmp = pA[0+bs*0];
+				pA[0+bs*0] = pC[0+bs*0];
+				pC[0+bs*0] = tmp;
+				pA += 1;
+				pC += 1;
+				}
+			pA += bs*(sda-1);
+			pC += bs*(sdc-1);
+			kmax -= bs-offsetA;
+			}
+		ii = 0;
+		for(; ii<kmax-3; ii+=4)
+			{
+			tmp = pA[0+bs*0];
+			pA[0+bs*0] = pC[0+bs*0];
+			pC[0+bs*0] = tmp;
+			tmp = pA[1+bs*0];
+			pA[1+bs*0] = pC[1+bs*0];
+			pC[1+bs*0] = tmp;
+			tmp = pA[2+bs*0];
+			pA[2+bs*0] = pC[2+bs*0];
+			pC[2+bs*0] = tmp;
+			tmp = pA[3+bs*0];
+			pA[3+bs*0] = pC[3+bs*0];
+			pC[3+bs*0] = tmp;
+			pA += bs*sda;
+			pC += bs*sdc;
+			}
+		for(; ii<kmax; ii++)
+			{
+			tmp = pA[0+bs*0];
+			pA[0+bs*0] = pC[0+bs*0];
+			pC[0+bs*0] = tmp;
+			pA += 1;
+			pC += 1;
+			}
+		}
+	else
+		{
+		printf("\nscolsw: feature not implemented yet: offsetA!=offsetC\n\n");
+		exit(1);
+		}
+
+	return;
+
+	}
+
+
+
+// insert vector to vector, sparse formulation
+void svecin_libsp(int kmax, int *idx, float *x, float *y)
+	{
+
+	int jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		y[idx[jj]] = x[jj];
+		}
+	
+	}
+
+
+
+// add scaled vector to vector, sparse formulation
+void svecad_libsp(int kmax, int *idx, float alpha, float *x, float *y)
+	{
+
+	int jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		y[idx[jj]] += alpha * x[jj];
+		}
+	
+	}
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// return the memory size (in bytes) needed for a strmat
+int s_size_strmat(int m, int n)
+	{
+	const int bs = 4;
+	int nc = S_NC;
+	int al = bs*nc;
+	int pm = (m+bs-1)/bs*bs;
+	int cn = (n+nc-1)/nc*nc;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	int memory_size = (pm*cn+tmp)*sizeof(float);
+	return memory_size;
+	}
+
+
+
+// return the memory size (in bytes) needed for the diagonal of a strmat
+int s_size_diag_strmat(int m, int n)
+	{
+	const int bs = 4;
+	int nc = S_NC;
+	int al = bs*nc;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	int memory_size = tmp*sizeof(float);
+	return memory_size;
+	}
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void s_create_strmat(int m, int n, struct s_strmat *sA, void *memory)
+	{
+	const int bs = 4;
+	int nc = S_NC;
+	int al = bs*nc;
+	sA->m = m;
+	sA->n = n;
+	int pm = (m+bs-1)/bs*bs;
+	int cn = (n+nc-1)/nc*nc;
+	sA->pm = pm;
+	sA->cn = cn;
+	float *ptr = (float *) memory;
+	sA->pA = ptr;
+	ptr += pm*cn;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	sA->dA = ptr;
+	ptr += tmp;
+	sA->use_dA = 0;
+	sA->memory_size = (pm*cn+tmp)*sizeof(float);
+	return;
+	}
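+
+/*
+   Worked sizing example (illustrative only): in the high-performance layout the row
+   dimension is rounded up to pm (a multiple of bs = 4), the column dimension to cn
+   (a multiple of nc = S_NC), and extra space for the dA array is reserved as min(m,n)
+   rounded up to a multiple of bs*nc. E.g. for m = 6, n = 3 one gets pm = 8, while cn
+   and the dA padding depend on the value of S_NC; the returned size is
+   (pm*cn + tmp)*sizeof(float).
+
+   void *mem = malloc(s_size_strmat(6, 3));
+   struct s_strmat sA;
+   s_create_strmat(6, 3, &sA, mem);   // sA.pm == 8, sA.cn is the padded column count
+*/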
+
+
+
+// return memory size (in bytes) needed for a strvec
+int s_size_strvec(int m)
+	{
+	const int bs = 4;
+//	int nc = S_NC;
+//	int al = bs*nc;
+	int pm = (m+bs-1)/bs*bs;
+	int memory_size = pm*sizeof(float);
+	return memory_size;
+	}
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void s_create_strvec(int m, struct s_strvec *sa, void *memory)
+	{
+	const int bs = 4;
+//	int nc = S_NC;
+//	int al = bs*nc;
+	sa->m = m;
+	int pm = (m+bs-1)/bs*bs;
+	sa->pm = pm;
+	float *ptr = (float *) memory;
+	sa->pa = ptr;
+//	ptr += pm;
+	sa->memory_size = pm*sizeof(float);
+	return;
+	}
+
+
+
+// convert a matrix into a matrix structure
+void s_cvt_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, j, jj, m0, m1, m2;
+	float *B, *pB;
+	m0 = (bs-ai%bs)%bs;
+	if(m0>m)
+		m0 = m;
+	m1 = m - m0;
+	jj = 0;
+	for( ; jj<n-3; jj+=4)
+		{
+		B  =  A + jj*lda;
+		pB = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for( ; ii<m0; ii++)
+				{
+				pB[ii+bs*0] = B[ii+lda*0];
+				pB[ii+bs*1] = B[ii+lda*1];
+				pB[ii+bs*2] = B[ii+lda*2];
+				pB[ii+bs*3] = B[ii+lda*3];
+				}
+			B  += m0;
+			pB += m0 + bs*(sda-1);
+			}
+		for( ; ii<m-3; ii+=4)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			pB[1+bs*0] = B[1+lda*0];
+			pB[2+bs*0] = B[2+lda*0];
+			pB[3+bs*0] = B[3+lda*0];
+			// col 1
+			pB[0+bs*1] = B[0+lda*1];
+			pB[1+bs*1] = B[1+lda*1];
+			pB[2+bs*1] = B[2+lda*1];
+			pB[3+bs*1] = B[3+lda*1];
+			// col 2
+			pB[0+bs*2] = B[0+lda*2];
+			pB[1+bs*2] = B[1+lda*2];
+			pB[2+bs*2] = B[2+lda*2];
+			pB[3+bs*2] = B[3+lda*2];
+			// col 3
+			pB[0+bs*3] = B[0+lda*3];
+			pB[1+bs*3] = B[1+lda*3];
+			pB[2+bs*3] = B[2+lda*3];
+			pB[3+bs*3] = B[3+lda*3];
+			// update
+			B  += 4;
+			pB += bs*sda;
+			}
+		for( ; ii<m; ii++)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			// col 1
+			pB[0+bs*1] = B[0+lda*1];
+			// col 2
+			pB[0+bs*2] = B[0+lda*2];
+			// col 3
+			pB[0+bs*3] = B[0+lda*3];
+			// update
+			B  += 1;
+			pB += 1;
+			}
+		}
+	for( ; jj<n; jj++)
+		{
+
+		B  =  A + jj*lda;
+		pB = pA + jj*bs;
+
+		ii = 0;
+		if(m0>0)
+			{
+			for( ; ii<m0; ii++)
+				{
+				pB[ii+bs*0] = B[ii+lda*0];
+				}
+			B  += m0;
+			pB += m0 + bs*(sda-1);
+			}
+		for( ; ii<m-3; ii+=4)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			pB[1+bs*0] = B[1+lda*0];
+			pB[2+bs*0] = B[2+lda*0];
+			pB[3+bs*0] = B[3+lda*0];
+			// update
+			B  += 4;
+			pB += bs*sda;
+			}
+		for( ; ii<m; ii++)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			// update
+			B  += 1;
+			pB += 1;
+			}
+		}
+	return;
+	}
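+
+/*
+   Storage sketch (illustrative only): the high-performance backend stores the matrix
+   in horizontal panels of bs = 4 rows. After converting with ai = aj = 0, element
+   A(i,j) lives at
+
+       pA[ (i/4)*4*sda + (i%4) + j*4 ]        with sda = sA->cn,
+
+   i.e. rows 0..3 fill the first panel column by column (4 floats per column), rows
+   4..7 start a second panel 4*sda floats further on, and so on; the m0 head handled
+   above covers the case where ai is not a multiple of 4.
+*/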
+
+
+
+// convert and transpose a matrix into a matrix structure
+void s_cvt_tran_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, j, m0, m1, m2;
+	float 	*B, *pB;
+	m0 = (bs-ai%bs)%bs;
+	if(m0>n)
+		m0 = n;
+	m1 = n - m0;
+	ii = 0;
+	if(m0>0)
+		{
+		for(j=0; j<m; j++)
+			{
+			for(i=0; i<m0; i++)
+				{
+				pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+				}
+			}
+		A  += m0*lda;
+		pA += m0 + bs*(sda-1);
+		}
+	ii = 0;
+	for(; ii<m1-3; ii+=bs)
+		{
+		j=0;
+		B  = A + ii*lda;
+		pB = pA + ii*sda;
+		for(; j<m-3; j+=4)
+			{
+			// unroll 0
+			pB[0+0*bs] = B[0+0*lda];
+			pB[1+0*bs] = B[0+1*lda];
+			pB[2+0*bs] = B[0+2*lda];
+			pB[3+0*bs] = B[0+3*lda];
+			// unroll 1
+			pB[0+1*bs] = B[1+0*lda];
+			pB[1+1*bs] = B[1+1*lda];
+			pB[2+1*bs] = B[1+2*lda];
+			pB[3+1*bs] = B[1+3*lda];
+			// unroll 2
+			pB[0+2*bs] = B[2+0*lda];
+			pB[1+2*bs] = B[2+1*lda];
+			pB[2+2*bs] = B[2+2*lda];
+			pB[3+2*bs] = B[2+3*lda];
+			// unroll 3
+			pB[0+3*bs] = B[3+0*lda];
+			pB[1+3*bs] = B[3+1*lda];
+			pB[2+3*bs] = B[3+2*lda];
+			pB[3+3*bs] = B[3+3*lda];
+			B  += 4;
+			pB += 4*bs;
+			}
+		for(; j<m; j++)
+			{
+			// unroll 0
+			pB[0+0*bs] = B[0+0*lda];
+			pB[1+0*bs] = B[0+1*lda];
+			pB[2+0*bs] = B[0+2*lda];
+			pB[3+0*bs] = B[0+3*lda];
+			B  += 1;
+			pB += 1*bs;
+			}
+		}
+	if(ii<m1)
+		{
+		m2 = m1-ii;
+		if(bs<m2) m2 = bs;
+		for(j=0; j<m; j++)
+			{
+			for(i=0; i<m2; i++)
+				{
+				pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+				}
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector into a vector structure
+void s_cvt_vec2strvec(int m, float *a, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		pa[ii] = a[ii];
+	return;
+	}
+
+
+
+// convert a matrix structure into a matrix
+void s_cvt_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, jj;
+	int m0 = (bs-ai%bs)%bs;
+	float *ptr_pA;
+	jj=0;
+	for(; jj<n-3; jj+=4)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				// unroll 0
+				A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+				// unroll 1
+				A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+				// unroll 2
+				A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+				// unroll 3
+				A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			// unroll 0
+			A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+			A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
+			A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
+			A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
+			// unroll 1
+			A[0+ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+			A[1+ii+lda*(jj+1)] = ptr_pA[1+bs*1];
+			A[2+ii+lda*(jj+1)] = ptr_pA[2+bs*1];
+			A[3+ii+lda*(jj+1)] = ptr_pA[3+bs*1];
+			// unroll 2
+			A[0+ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+			A[1+ii+lda*(jj+2)] = ptr_pA[1+bs*2];
+			A[2+ii+lda*(jj+2)] = ptr_pA[2+bs*2];
+			A[3+ii+lda*(jj+2)] = ptr_pA[3+bs*2];
+			// unroll 3
+			A[0+ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+			A[1+ii+lda*(jj+3)] = ptr_pA[1+bs*3];
+			A[2+ii+lda*(jj+3)] = ptr_pA[2+bs*3];
+			A[3+ii+lda*(jj+3)] = ptr_pA[3+bs*3];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			// unroll 0
+			A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+			// unroll 1
+			A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+			// unroll 2
+			A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+			// unroll 3
+			A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+			ptr_pA++;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				A[ii+lda*jj] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			A[0+ii+lda*jj] = ptr_pA[0];
+			A[1+ii+lda*jj] = ptr_pA[1];
+			A[2+ii+lda*jj] = ptr_pA[2];
+			A[3+ii+lda*jj] = ptr_pA[3];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			A[ii+lda*jj] = ptr_pA[0];
+			ptr_pA++;
+			}
+		}
+	return;
+	}
+
+
+
+// convert and transpose a matrix structure into a matrix
+void s_cvt_tran_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, jj;
+	int m0 = (bs-ai%bs)%bs;
+	float *ptr_pA;
+	jj=0;
+	for(; jj<n-3; jj+=4)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				// unroll 0
+				A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+				// unroll 1
+				A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+				// unroll 2
+				A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+				// unroll 3
+				A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			// unroll 0
+			A[jj+0+lda*(ii+0)] = ptr_pA[0+bs*0];
+			A[jj+0+lda*(ii+1)] = ptr_pA[1+bs*0];
+			A[jj+0+lda*(ii+2)] = ptr_pA[2+bs*0];
+			A[jj+0+lda*(ii+3)] = ptr_pA[3+bs*0];
+			// unroll 1
+			A[jj+1+lda*(ii+0)] = ptr_pA[0+bs*1];
+			A[jj+1+lda*(ii+1)] = ptr_pA[1+bs*1];
+			A[jj+1+lda*(ii+2)] = ptr_pA[2+bs*1];
+			A[jj+1+lda*(ii+3)] = ptr_pA[3+bs*1];
+			// unroll 2
+			A[jj+2+lda*(ii+0)] = ptr_pA[0+bs*2];
+			A[jj+2+lda*(ii+1)] = ptr_pA[1+bs*2];
+			A[jj+2+lda*(ii+2)] = ptr_pA[2+bs*2];
+			A[jj+2+lda*(ii+3)] = ptr_pA[3+bs*2];
+			// unroll 3
+			A[jj+3+lda*(ii+0)] = ptr_pA[0+bs*3];
+			A[jj+3+lda*(ii+1)] = ptr_pA[1+bs*3];
+			A[jj+3+lda*(ii+2)] = ptr_pA[2+bs*3];
+			A[jj+3+lda*(ii+3)] = ptr_pA[3+bs*3];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			// unroll 0
+			A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+			// unroll 1
+			A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+			// unroll 2
+			A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+			// unroll 3
+			A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+			ptr_pA++;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				A[jj+lda*ii] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			i=0;
+			for(; i<bs; i++)
+				{
+				A[jj+lda*(i+ii)] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			A[jj+lda*ii] = ptr_pA[0];
+			ptr_pA++;
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector structure into a vector 
+void s_cvt_strvec2vec(int m, struct s_strvec *sa, int ai, float *a)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		a[ii] = pa[ii];
+	return;
+	}
+
+
+
+// cast a matrix into a matrix structure
+void s_cast_mat2strmat(float *A, struct s_strmat *sA)
+	{
+	sA->pA = A;
+	return;
+	}
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void s_cast_diag_mat2strmat(float *dA, struct s_strmat *sA)
+	{
+	sA->dA = dA;
+	return;
+	}
+
+
+
+// cast a vector into a vector structure
+void s_cast_vec2vecmat(float *a, struct s_strvec *sa)
+	{
+	sa->pa = a;
+	return;
+	}
+
+
+
+// insert element into strmat
+void sgein1_libstr(float a, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	pA[0] = a;
+	return;
+	}
+
+
+
+// extract element from strmat
+float sgeex1_libstr(struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	return pA[0];
+	}
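+
+/*
+   Worked addressing example (illustrative only): with bs = 4 and sda = sA->cn, the
+   element at (ai, aj) = (6, 2) used by the two routines above is found at offset
+
+       ai/bs*bs*sda + ai%bs + aj*bs  =  1*4*sda + 2 + 2*4,
+
+   i.e. panel 1 (rows 4..7), local row 2, column 2.
+*/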
+
+
+
+// insert element into strvec
+void svecin1_libstr(float a, struct s_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	float *x = sx->pa + xi;
+	x[0] = a;
+	return;
+	}
+
+
+
+// extract element from strvec
+float svecex1_libstr(struct s_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	float *x = sx->pa + xi;
+	return x[0];
+	}
+
+
+
+// set all elements of a strmat to a value
+void sgese_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai%bs + ai/bs*bs*sda + aj*bs;
+	int m0 = m<(bs-ai%bs)%bs ? m : (bs-ai%bs)%bs;
+	int ii, jj;
+	if(m0>0)
+		{
+		for(ii=0; ii<m0; ii++)
+			{
+			for(jj=0; jj<n; jj++)
+				{
+				pA[jj*bs] = alpha;
+				}
+			pA += 1;
+			}
+		pA += bs*(sda-1);
+		m -= m0;
+		}
+	for(ii=0; ii<m-3; ii+=4)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			pA[0+jj*bs] = alpha;
+			pA[1+jj*bs] = alpha;
+			pA[2+jj*bs] = alpha;
+			pA[3+jj*bs] = alpha;
+			}
+		pA += bs*sda;
+		}
+	for( ; ii<m; ii++)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			pA[jj*bs] = alpha;
+			}
+		pA += 1;
+		}
+	return;
+	}
+
+
+
+// set all elements of a strvec to a value
+void svecse_libstr(int m, float alpha, struct s_strvec *sx, int xi)
+	{
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		x[ii] = alpha;
+	return;
+	}
+
+
+
+// extract diagonal to vector
+void sdiaex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	sdiaex_lib(kmax, alpha, ai%bs, pA, sda, x);
+	return;
+	}
+
+
+
+// insert a vector into diagonal
+void sdiain_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	sdiain_lib(kmax, alpha, x, ai%bs, pA, sda);
+	return;
+	}
+
+
+
+// swap two rows of a matrix struct
+void srowsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	srowsw_lib(kmax, pA, pC);
+	return;
+	}
+
+
+
+// permute the rows of a matrix struct
+void srowpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			srowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+		}
+	return;
+	}
+
+
+// extract a row into a vector
+void srowex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	srowex_lib(kmax, alpha, pA, x);
+	return;
+	}
+
+
+
+// insert a vector into a row
+void srowin_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	srowin_lib(kmax, alpha, x, pA);
+	return;
+	}
+
+
+
+// add a vector to a row
+void srowad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	srowad_lib(kmax, alpha, x, pA);
+	return;
+	}
+
+
+
+// swap two cols of a matrix struct
+void scolsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	scolsw_lib(kmax, ai%bs, pA, sda, ci%bs, pC, sdc);
+	return;
+	}
+
+
+
+// permute the cols of a matrix struct
+void scolpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			scolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+		}
+	return;
+	}
+
+
+
+// scale a generic strmat
+void sgesc_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** sgesc_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** sgesc_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** sgesc_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** sgesc_libstr : aj<0 : %d<0 *****\n", aj);
+	// inside matrix
+	// A: m x n
+	if(ai+m > sA->m) printf("\n***** sgesc_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** sgesc_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+#endif
+
+	const int bs = 4;
+
+	int mna, ii;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+	int offA = ai%bs;
+
+	// same alignment
+	ii = 0;
+	// clean up at the beginning
+	mna = (4-offA)%bs;
+	if(mna>0)
+		{
+		if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+			{
+			if(m==1)
+				{
+				kernel_sgesc_1_lib4(n, &alpha, pA+offA);
+				return;
+				}
+			else //if(m==2 && mna==3)
+				{
+				kernel_sgesc_2_lib4(n, &alpha, pA+offA);
+				return;
+				}
+			}
+		if(mna==1)
+			{
+			kernel_sgesc_1_lib4(n, &alpha, pA+offA);
+			pA += 4*sda;
+			ii += 1;
+			}
+		else if(mna==2)
+			{
+			kernel_sgesc_2_lib4(n, &alpha, pA+offA);
+			pA += 4*sda;
+			ii += 2;
+			}
+		else // if(mna==3)
+			{
+			kernel_sgesc_3_lib4(n, &alpha, pA+offA);
+			pA += 4*sda;
+			ii += 3;
+			}
+		}
+	// main loop
+	for(; ii<m-3; ii+=4)
+		{
+		kernel_sgesc_4_lib4(n, &alpha, pA);
+		pA += 4*sda;
+		}
+	// clean up at the end
+	if(ii<m)
+		{
+		if(m-ii==1)
+			kernel_sgesc_1_lib4(n, &alpha, pA);
+		else if(m-ii==2)
+			kernel_sgesc_2_lib4(n, &alpha, pA);
+		else // if(m-ii==3)
+			kernel_sgesc_3_lib4(n, &alpha, pA);
+		}
+
+	return;
+
+	}
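+
+// Usage sketch (illustration only, assuming sA was created with
+// s_create_strmat and is at least 4x4): scale its leading 4x4 block by 0.5:
+//
+//   sgesc_libstr(4, 4, 0.5, &sA, 0, 0);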
+
+
+
+// copy a generic strmat into a generic strmat
+void sgecp_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** sgecp_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** sgecp_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** sgecp_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** sgecp_libstr : aj<0 : %d<0 *****\n", aj);
+	if(bi<0) printf("\n****** sgecp_libstr : bi<0 : %d<0 *****\n", bi);
+	if(bj<0) printf("\n****** sgecp_libstr : bj<0 : %d<0 *****\n", bj);
+	// inside matrix
+	// A: m x n
+	if(ai+m > sA->m) printf("\n***** sgecp_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** sgecp_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// B: m x n
+	if(bi+m > sB->m) printf("\n***** sgecp_libstr : bi+m > row(B) : %d+%d > %d *****\n", bi, m, sB->m);
+	if(bj+n > sB->n) printf("\n***** sgecp_libstr : bj+n > col(B) : %d+%d > %d *****\n", bj, n, sB->n);
+#endif
+
+	const int bs = 4;
+
+	int mna, ii;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+	float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+	int offA = ai%bs;
+	int offB = bi%bs;
+
+	// same alignment
+	if(offA==offB)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_sgecp_3_0_lib4(n, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_sgecp_4_0_lib4(n, pA, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_sgecp_1_0_lib4(n, pA, pB);
+			else if(m-ii==2)
+				kernel_sgecp_2_0_lib4(n, pA, pB);
+			else // if(m-ii==3)
+				kernel_sgecp_3_0_lib4(n, pA, pB);
+			}
+		}
+	// skip one element of pA
+	else if(offA==(offB+1)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+				//pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_sgecp_2_3_lib4(n, pA, sda, pB+2);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_sgecp_3_2_lib4(n, pA, sda, pB+1);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_sgecp_4_1_lib4(n, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_sgecp_1_0_lib4(n, pA+1, pB);
+			else if(m-ii==2)
+				kernel_sgecp_2_0_lib4(n, pA+1, pB);
+			else // if(m-ii==3)
+				kernel_sgecp_3_0_lib4(n, pA+1, pB);
+			}
+		}
+	// skip 2 elements of pA
+	else if(offA==(offB+2)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_sgecp_2_3_lib4(n, pA, sda, pB+1);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_sgecp_1_0_lib4(n, pA+1, pB+3);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_sgecp_2_0_lib4(n, pA, pB+2);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_sgecp_3_3_lib4(n, pA, sda, pB+1);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_sgecp_4_2_lib4(n, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_sgecp_1_0_lib4(n, pA+2, pB);
+			else if(m-ii==2)
+				kernel_sgecp_2_0_lib4(n, pA+2, pB);
+			else // if(m-ii==3)
+				kernel_sgecp_3_2_lib4(n, pA, sda, pB);
+			}
+		}
+	// skip 3 elements of pA
+	else // if(offA==(offB+3)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_sgecp_1_0_lib4(n, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_sgecp_2_0_lib4(n, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_sgecp_3_0_lib4(n, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_sgecp_4_3_lib4(n, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_sgecp_1_0_lib4(n, pA+3, pB);
+			else if(m-ii==2)
+				kernel_sgecp_2_3_lib4(n, pA, sda, pB);
+			else // if(m-ii==3)
+				kernel_sgecp_3_3_lib4(n, pA, sda, pB);
+			}
+		}
+
+	return;
+
+	}
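+
+// Structure note (added explanation): the four branches above are selected by
+// the relative row misalignment (offA-offB) mod bs between the source and
+// destination panels. The aligned case uses the *_x_0_* kernels; the
+// misaligned cases call kernels that take sda and read across two consecutive
+// panels of A, while writes to B always stay within a single panel.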
+
+
+
+// scale a strvec
+void svecsc_libstr(int m, float alpha, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pa[ii+0] *= alpha;
+		pa[ii+1] *= alpha;
+		pa[ii+2] *= alpha;
+		pa[ii+3] *= alpha;
+		}
+	for(; ii<m; ii++)
+		{
+		pa[ii+0] *= alpha;
+		}
+	return;
+	}
+
+
+
+// copy a strvec into a strvec
+void sveccp_libstr(int m, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+	{
+	float *pa = sa->pa + ai;
+	float *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] = pa[ii+0];
+		pc[ii+1] = pa[ii+1];
+		pc[ii+2] = pa[ii+2];
+		pc[ii+3] = pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] = pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void strcp_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+	{
+
+	if(m<=0)
+		return;
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+	float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+	int offA = ai%bs;
+	int offB = bi%bs;
+
+	int ii, mna;
+
+	// same alignment
+	if(offA==offB)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_strcp_l_3_0_lib4(ii, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_strcp_l_4_0_lib4(ii, pA, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_strcp_l_1_0_lib4(ii, pA, pB);
+			else if(m-ii==2)
+				kernel_strcp_l_2_0_lib4(ii, pA, pB);
+			else // if(m-ii==3)
+				kernel_strcp_l_3_0_lib4(ii, pA, pB);
+			}
+		}
+	// skip one element of pA
+	else if(offA==(offB+1)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+				//pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_strcp_l_2_3_lib4(ii, pA, sda, pB+2);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_strcp_l_3_2_lib4(ii, pA, sda, pB+1);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_strcp_l_4_1_lib4(ii, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_strcp_l_1_0_lib4(ii, pA+1, pB);
+			else if(m-ii==2)
+				kernel_strcp_l_2_0_lib4(ii, pA+1, pB);
+			else // if(m-ii==3)
+				kernel_strcp_l_3_0_lib4(ii, pA+1, pB);
+			}
+		}
+	// skip 2 elements of pA
+	else if(offA==(offB+2)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_strcp_l_2_3_lib4(ii, pA, sda, pB+1);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_strcp_l_1_0_lib4(ii, pA+1, pB+3);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_strcp_l_2_0_lib4(ii, pA, pB+2);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_strcp_l_3_3_lib4(ii, pA, sda, pB+1);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_strcp_l_4_2_lib4(ii, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_strcp_l_1_0_lib4(ii, pA+2, pB);
+			else if(m-ii==2)
+				kernel_strcp_l_2_0_lib4(ii, pA+2, pB);
+			else // if(m-ii==3)
+				kernel_strcp_l_3_2_lib4(ii, pA, sda, pB);
+			}
+		}
+	// skip 3 elements of pA
+	else // if(offA==(offB+3)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_strcp_l_1_0_lib4(ii, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_strcp_l_2_0_lib4(ii, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_strcp_l_3_0_lib4(ii, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_strcp_l_4_3_lib4(ii, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_strcp_l_1_0_lib4(ii, pA+3, pB);
+			else if(m-ii==2)
+				kernel_strcp_l_2_3_lib4(ii, pA, sda, pB);
+			else // if(m-ii==3)
+				kernel_strcp_l_3_3_lib4(ii, pA, sda, pB);
+			}
+		}
+
+	return;
+
+	}
+
+
+
+// scale and add a generic strmat into a generic strmat
+void sgead_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	const int bs = 4;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+	float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+	int offA = ai%bs;
+	int offB = bi%bs;
+
+	int ii, mna;
+
+	// same alignment
+	if(offA==offB)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_sgead_3_0_lib4(n, &alpha, pA+offA, pB+offB);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_sgead_4_0_lib4(n, &alpha, pA, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_sgead_1_0_lib4(n, &alpha, pA, pB);
+			else if(m-ii==2)
+				kernel_sgead_2_0_lib4(n, &alpha, pA, pB);
+			else // if(m-ii==3)
+				kernel_sgead_3_0_lib4(n, &alpha, pA, pB);
+			}
+		}
+	// skip one element of pA
+	else if(offA==(offB+1)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna) // mna<=3  ==>  m = { 1, 2 }
+				{
+				if(m==1)
+					{
+					kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+					return;
+					}
+				else //if(m==2 && mna==3)
+					{
+					kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+				//pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB+2);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_sgead_3_2_lib4(n, &alpha, pA, sda, pB+1);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_sgead_4_1_lib4(n, &alpha, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_sgead_1_0_lib4(n, &alpha, pA+1, pB);
+			else if(m-ii==2)
+				kernel_sgead_2_0_lib4(n, &alpha, pA+1, pB);
+			else // if(m-ii==3)
+				kernel_sgead_3_0_lib4(n, &alpha, pA+1, pB);
+			}
+		}
+	// skip 2 elements of pA
+	else if(offA==(offB+2)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB+1);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_sgead_1_0_lib4(n, &alpha, pA+1, pB+3);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_sgead_2_0_lib4(n, &alpha, pA, pB+2);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_sgead_3_3_lib4(n, &alpha, pA, sda, pB+1);
+				pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_sgead_4_2_lib4(n, &alpha, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_sgead_1_0_lib4(n, &alpha, pA+2, pB);
+			else if(m-ii==2)
+				kernel_sgead_2_0_lib4(n, &alpha, pA+2, pB);
+			else // if(m-ii==3)
+				kernel_sgead_3_2_lib4(n, &alpha, pA, sda, pB);
+			}
+		}
+	// skip 3 elements of pA
+	else // if(offA==(offB+3)%bs)
+		{
+		ii = 0;
+		// clean up at the beginning
+		mna = (4-offB)%bs;
+		if(mna>0)
+			{
+			if(m<mna)
+				{
+				if(m==1)
+					{
+					kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+					return;
+					}
+				else // if(m==2 && mna==3)
+					{
+					kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+					return;
+					}
+				}
+			if(mna==1)
+				{
+				kernel_sgead_1_0_lib4(n, &alpha, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 1;
+				}
+			else if(mna==2)
+				{
+				kernel_sgead_2_0_lib4(n, &alpha, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 2;
+				}
+			else // if(mna==3)
+				{
+				kernel_sgead_3_0_lib4(n, &alpha, pA+offA, pB+offB);
+				// pA += 4*sda;
+				pB += 4*sdb;
+				ii += 3;
+				}
+			}
+		// main loop
+		for(; ii<m-3; ii+=4)
+			{
+			kernel_sgead_4_3_lib4(n, &alpha, pA, sda, pB);
+			pA += 4*sda;
+			pB += 4*sdb;
+			}
+		// clean up at the end
+		if(ii<m)
+			{
+			if(m-ii==1)
+				kernel_sgead_1_0_lib4(n, &alpha, pA+3, pB);
+			else if(m-ii==2)
+				kernel_sgead_2_3_lib4(n, &alpha, pA, sda, pB);
+			else // if(m-ii==3)
+				kernel_sgead_3_3_lib4(n, &alpha, pA, sda, pB);
+			}
+		}
+
+	return;
+
+	}
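+
+// Usage sketch (illustration only): accumulate B += 2.0*A on the leading 3x3
+// blocks of two strmats sA and sB, both assumed to be at least 3x3:
+//
+//   sgead_libstr(3, 3, 2.0, &sA, 0, 0, &sB, 0, 0);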
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void sgetr_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	sgetr_lib(m, n, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+	return;
+	}
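+
+// Usage sketch (illustration only): write the transpose of the m x n block of
+// sA at (0,0) into sC at (0,0); sC is assumed to provide at least n rows and
+// m columns:
+//
+//   sgetr_libstr(m, n, &sA, 0, 0, &sC, 0, 0);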
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void strtr_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	strtr_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+	return;
+	}
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void strtr_u_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 4;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	strtr_u_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+	return;
+	}
+
+
+
+// insert a strvec to diagonal of strmat, sparse formulation 
+void sdiain_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	const int bs = 4;
+	float *x = sx->pa + xi;
+	int sdd = sD->cn;
+	float *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// extract the diagonal of a strmat to a strvec, sparse formulation 
+void sdiaex_sp_libstr(int kmax, float alpha, int *idx, struct s_strmat *sD, int di, int dj, struct s_strvec *sx, int xi)
+	{
+	const int bs = 4;
+	float *x = sx->pa + xi;
+	int sdd = sD->cn;
+	float *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		x[jj] = alpha * pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs];
+		}
+	return;
+	}
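+
+// Illustration of the sparse formulation (not library code): idx selects which
+// diagonal entries are touched. With kmax=2 and idx = {0, 3}, sdiaex_sp_libstr
+// writes x[0] = alpha*D(di+0,dj+0) and x[1] = alpha*D(di+3,dj+3), leaving the
+// remaining entries of x untouched.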
+
+
+
+// add scaled strvec to diagonal of strmat, sparse formulation 
+void sdiaad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	const int bs = 4;
+	float *x = sx->pa + xi;
+	int sdd = sD->cn;
+	float *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation 
+void sdiaadin_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	const int bs = 4;
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	int sdd = sD->cn;
+	float *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = y[jj] + alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to row of strmat, sparse formulation 
+void srowad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	const int bs = 4;
+	float *x = sx->pa + xi;
+	int sdd = sD->cn;
+	float *pD = sD->pA + di/bs*bs*sdd + di%bs + dj*bs;
+	srowad_libsp(kmax, idx, alpha, x, pD);
+	return;
+	}
+
+
+
+// add scaled strvec to strvec, sparse formulation
+void svecad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sy, int yi)
+	{
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	svecad_libsp(kmax, idx, alpha, x, y);
+	return;
+	}
+
+
+
+void svecin_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[idx[ii]] = alpha * x[ii];
+	return;
+	}
+
+
+
+void svecex_sp_libstr(int m, float alpha, int *idx, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[ii] = alpha * x[idx[ii]];
+	return;
+	}
+
+
+
+void svecnrm_inf_libstr(int m, struct s_strvec *sx, int xi, float *ptr_norm)
+	{
+	int ii;
+	float *x = sx->pa + xi;
+	float norm = 0.0;
+	for(ii=0; ii<m; ii++)
+		norm = fmax(norm, fabs(x[ii]));
+	*ptr_norm = norm;
+	return;
+	}
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
diff --git a/auxiliary/s_aux_lib8.c b/auxiliary/s_aux_lib8.c
new file mode 100644
index 0000000..94ba22d
--- /dev/null
+++ b/auxiliary/s_aux_lib8.c
@@ -0,0 +1,2647 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_block_size.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+// copies a lower triangular packed matrix into a lower triangular packed matrix
+void strcp_l_lib(int m, int offsetA, float *A, int sda, int offsetB, float *B, int sdb)
+	{
+	printf("\nstrcp_l_lib: feature not implemented yet\n");
+	exit(1);
+	}
+
+
+
+// scales and adds a strvec into a strvec
+void svecad_libstr(int m, float alpha, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+	{
+	float *pa = sa->pa + ai;
+	float *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		pc[ii+1] += alpha*pa[ii+1];
+		pc[ii+2] += alpha*pa[ii+2];
+		pc[ii+3] += alpha*pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] += alpha*pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// transpose lower triangular matrix
+void strtr_l_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+	{
+	printf("\nstrtr_l_lib: feature not implemented yet\n");
+	exit(1);
+	}
+
+
+
+// transpose an aligned upper triangular matrix into an aligned lower triangular matrix
+void strtr_u_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+	{
+	printf("\nstrtr_u_lib: feature not implemented yet\n");
+	exit(1);
+	}
+
+
+
+// regularize diagonal 
+void sdiareg_lib(int kmax, float reg, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	float *pD2;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] += reg;
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		kmax -= kna;
+		}
+	pD2 = pD;
+	for(jj=0; jj<kmax-7; jj+=8)
+		{
+		pD2[0+0*bs] += reg;
+		pD2[1+1*bs] += reg;
+		pD2[2+2*bs] += reg;
+		pD2[3+3*bs] += reg;
+		pD2[4+4*bs] += reg;
+		pD2[5+5*bs] += reg;
+		pD2[6+6*bs] += reg;
+		pD2[7+7*bs] += reg;
+		pD2 += bs*sdd+bs*bs;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] += reg;
+		}
+	
+	}
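+
+// Note (added explanation): kna counts the diagonal entries that fall in the
+// partially used first panel. E.g. with bs=8 and offset=5, kna = (8-5)%8 = 3,
+// so 3 entries are handled element-wise and pD is then advanced by
+// kna + bs*(sdd-1) + kna*bs, i.e. to the top of the next panel and kna columns
+// to the right.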
+
+
+
+// insert vector to diagonal 
+void sdiain_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	float *pD2, *x2;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] = alpha*x[ll];
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	pD2 = pD;
+	x2 = x;
+	for(jj=0; jj<kmax-7; jj+=8)
+		{
+		pD2[0+bs*0] = alpha*x2[0];
+		pD2[1+bs*1] = alpha*x2[1];
+		pD2[2+bs*2] = alpha*x2[2];
+		pD2[3+bs*3] = alpha*x2[3];
+		pD2[4+bs*4] = alpha*x2[4];
+		pD2[5+bs*5] = alpha*x2[5];
+		pD2[6+bs*6] = alpha*x2[6];
+		pD2[7+bs*7] = alpha*x2[7];
+		pD2 += bs*sdd+bs*bs;
+		x2 += bs;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] = alpha*x[jj+ll];
+		}
+	
+	}
+
+
+
+// insert sqrt of vector to diagonal 
+void sdiain_sqrt_lib(int kmax, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	float *pD2, *x2;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] = sqrt(x[ll]);
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	pD2 = pD;
+	x2 = x;
+	for(jj=0; jj<kmax-7; jj+=8)
+		{
+		pD2[0+bs*0] = sqrt(x2[0]);
+		pD2[1+bs*1] = sqrt(x2[1]);
+		pD2[2+bs*2] = sqrt(x2[2]);
+		pD2[3+bs*3] = sqrt(x2[3]);
+		pD2[4+bs*4] = sqrt(x2[4]);
+		pD2[5+bs*5] = sqrt(x2[5]);
+		pD2[6+bs*6] = sqrt(x2[6]);
+		pD2[7+bs*7] = sqrt(x2[7]);
+		pD2 += bs*sdd+bs*bs;
+		x2 += bs;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] = sqrt(x[jj+ll]);
+		}
+	
+	}
+
+
+
+// extract diagonal to vector 
+void sdiaex_lib(int kmax, float alpha, int offset, float *pD, int sdd, float *x)
+	{
+
+	const int bs = 8;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	float *pD2, *x2;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			x[ll] = alpha * pD[ll+bs*ll];
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	pD2 = pD;
+	x2 = x;
+	for(jj=0; jj<kmax-7; jj+=8)
+		{
+		x2[0] = alpha * pD2[0+bs*0];
+		x2[1] = alpha * pD2[1+bs*1];
+		x2[2] = alpha * pD2[2+bs*2];
+		x2[3] = alpha * pD2[3+bs*3];
+		x2[4] = alpha * pD2[4+bs*4];
+		x2[5] = alpha * pD2[5+bs*5];
+		x2[6] = alpha * pD2[6+bs*6];
+		x2[7] = alpha * pD2[7+bs*7];
+		pD2 += bs*sdd+bs*bs;
+		x2 += bs;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		x[jj+ll] = alpha * pD[jj*sdd+(jj+ll)*bs+ll];
+		}
+	
+	}
+
+
+
+// add scaled vector to diagonal 
+void sdiaad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	float *pD2, *x2;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll+bs*ll] += alpha * x[ll];
+			}
+		pD += kna + bs*(sdd-1) + kna*bs;
+		x  += kna;
+		kmax -= kna;
+		}
+	pD2 = pD;
+	x2 = x;
+	for(jj=0; jj<kmax-7; jj+=8)
+		{
+		pD2[0+bs*0] += alpha * x2[0];
+		pD2[1+bs*1] += alpha * x2[1];
+		pD2[2+bs*2] += alpha * x2[2];
+		pD2[3+bs*3] += alpha * x2[3];
+		pD2[4+bs*4] += alpha * x2[4];
+		pD2[5+bs*5] += alpha * x2[5];
+		pD2[6+bs*6] += alpha * x2[6];
+		pD2[7+bs*7] += alpha * x2[7];
+		pD2 += bs*sdd+bs*bs;
+		x2 += bs;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[jj*sdd+(jj+ll)*bs+ll] += alpha * x[jj+ll];
+		}
+	return;
+	}
+
+
+
+// insert vector to diagonal, sparse formulation 
+void sdiain_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] = alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// extract diagonal to vector, sparse formulation 
+void sdiaex_libsp(int kmax, int *idx, float alpha, float *pD, int sdd, float *x)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		x[jj] = alpha * pD[ii/bs*bs*sdd+ii%bs+ii*bs];
+		}
+	return;
+	}
+
+
+
+// add scaled vector to diagonal, sparse formulation 
+void sdiaad_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled vector to another vector and insert to diagonal, sparse formulation 
+void sdiaadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs+ii*bs] = y[jj] + alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// insert vector to row 
+void srowin_lib(int kmax, float alpha, float *x, float *pD)
+	{
+	
+	const int bs = 8;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[0*bs] = alpha * x[0];
+		pD[1*bs] = alpha * x[1];
+		pD[2*bs] = alpha * x[2];
+		pD[3*bs] = alpha * x[3];
+		pD += 4*bs;
+		x += 4;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[ll*bs] = alpha*x[ll];
+		}
+	return;
+	}
+
+
+
+// extract row to vector
+void srowex_lib(int kmax, float alpha, float *pD, float *x)
+	{
+	
+	const int bs = 8;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		x[0] = alpha * pD[0*bs];
+		x[1] = alpha * pD[1*bs];
+		x[2] = alpha * pD[2*bs];
+		x[3] = alpha * pD[3*bs];
+		pD += 4*bs;
+		x += 4;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		x[ll] = alpha*pD[ll*bs];
+		}
+	return;
+	}
+
+
+
+// add scaled vector to row 
+void srowad_lib(int kmax, float alpha, float *x, float *pD)
+	{
+
+	const int bs = 8;
+
+	int jj, ll;
+
+	for(jj=0; jj<kmax-3; jj+=4)
+		{
+		pD[0*bs] += alpha * x[0];
+		pD[1*bs] += alpha * x[1];
+		pD[2*bs] += alpha * x[2];
+		pD[3*bs] += alpha * x[3];
+		pD += 4*bs;
+		x += 4;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[ll*bs] += alpha * x[ll];
+		}
+	return;
+	}
+
+
+
+// insert vector to row, sparse formulation 
+void srowin_libsp(int kmax, float alpha, int *idx, float *x, float *pD)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] = alpha*x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled vector to row, sparse formulation 
+void srowad_libsp(int kmax, int *idx, float alpha, float *x, float *pD)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled vector to another vector and insert to row, sparse formulation 
+void srowadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii*bs] = y[jj] + alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// swap two rows
+void srowsw_lib(int kmax, float *pA, float *pC)
+	{
+
+	const int bs = 8;
+
+	int ii;
+	float tmp;
+
+	for(ii=0; ii<kmax-3; ii+=4)
+		{
+		tmp = pA[0+bs*0];
+		pA[0+bs*0] = pC[0+bs*0];
+		pC[0+bs*0] = tmp;
+		tmp = pA[0+bs*1];
+		pA[0+bs*1] = pC[0+bs*1];
+		pC[0+bs*1] = tmp;
+		tmp = pA[0+bs*2];
+		pA[0+bs*2] = pC[0+bs*2];
+		pC[0+bs*2] = tmp;
+		tmp = pA[0+bs*3];
+		pA[0+bs*3] = pC[0+bs*3];
+		pC[0+bs*3] = tmp;
+		pA += 4*bs;
+		pC += 4*bs;
+		}
+	for( ; ii<kmax; ii++)
+		{
+		tmp = pA[0+bs*0];
+		pA[0+bs*0] = pC[0+bs*0];
+		pC[0+bs*0] = tmp;
+		pA += 1*bs;
+		pC += 1*bs;
+		}
+	return;
+	}
+
+
+
+// insert vector to column 
+void scolin_lib(int kmax, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll] = x[ll];
+			}
+		pD += kna + bs*(sdd-1);
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-7; jj+=8)
+		{
+		pD[0] = x[0];
+		pD[1] = x[1];
+		pD[2] = x[2];
+		pD[3] = x[3];
+		pD[4] = x[4];
+		pD[5] = x[5];
+		pD[6] = x[6];
+		pD[7] = x[7];
+		pD += bs*sdd;
+		x += bs;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[ll] = x[ll];
+		}
+	
+	}
+
+
+
+// add scaled vector to column 
+void scolad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int kna = (bs-offset%bs)%bs;
+	kna = kmax<kna ? kmax : kna;
+
+	int jj, ll;
+
+	if(kna>0)
+		{
+		for(ll=0; ll<kna; ll++)
+			{
+			pD[ll] += alpha * x[ll];
+			}
+		pD += kna + bs*(sdd-1);
+		x  += kna;
+		kmax -= kna;
+		}
+	for(jj=0; jj<kmax-7; jj+=8)
+		{
+		pD[0] += alpha * x[0];
+		pD[1] += alpha * x[1];
+		pD[2] += alpha * x[2];
+		pD[3] += alpha * x[3];
+		pD[4] += alpha * x[4];
+		pD[5] += alpha * x[5];
+		pD[6] += alpha * x[6];
+		pD[7] += alpha * x[7];
+		pD += bs*sdd;
+		x += bs;
+		}
+	for(ll=0; ll<kmax-jj; ll++)
+		{
+		pD[ll] += alpha * x[ll];
+		}
+	
+	}
+
+
+
+// insert vector to diagonal, sparse formulation 
+void scolin_libsp(int kmax, int *idx, float *x, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs] = x[jj];
+		}
+	
+	}
+
+
+
+// add scaled vector to diagonal, sparse formulation 
+void scolad_libsp(int kmax, float alpha, int *idx, float *x, float *pD, int sdd)
+	{
+
+	const int bs = 8;
+
+	int ii, jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[ii/bs*bs*sdd+ii%bs] += alpha * x[jj];
+		}
+	
+	}
+
+
+
+// swaps two cols
+void scolsw_lib(int kmax, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc)
+	{
+
+	const int bs = 8;
+
+	int ii;
+
+	float tmp;
+
+	if(offsetA==offsetC)
+		{
+		if(offsetA>0)
+			{
+			ii = 0;
+			for(; ii<bs-offsetA; ii++)
+				{
+				tmp = pA[0+bs*0];
+				pA[0+bs*0] = pC[0+bs*0];
+				pC[0+bs*0] = tmp;
+				pA += 1;
+				pC += 1;
+				}
+			pA += bs*(sda-1);
+			pC += bs*(sdc-1);
+			kmax -= bs-offsetA;
+			}
+		ii = 0;
+		for(; ii<kmax-7; ii+=8)
+			{
+			tmp = pA[0+bs*0];
+			pA[0+bs*0] = pC[0+bs*0];
+			pC[0+bs*0] = tmp;
+			tmp = pA[1+bs*0];
+			pA[1+bs*0] = pC[1+bs*0];
+			pC[1+bs*0] = tmp;
+			tmp = pA[2+bs*0];
+			pA[2+bs*0] = pC[2+bs*0];
+			pC[2+bs*0] = tmp;
+			tmp = pA[3+bs*0];
+			pA[3+bs*0] = pC[3+bs*0];
+			pC[3+bs*0] = tmp;
+			tmp = pA[4+bs*0];
+			pA[4+bs*0] = pC[4+bs*0];
+			pC[4+bs*0] = tmp;
+			tmp = pA[5+bs*0];
+			pA[5+bs*0] = pC[5+bs*0];
+			pC[5+bs*0] = tmp;
+			tmp = pA[6+bs*0];
+			pA[6+bs*0] = pC[6+bs*0];
+			pC[6+bs*0] = tmp;
+			tmp = pA[7+bs*0];
+			pA[7+bs*0] = pC[7+bs*0];
+			pC[7+bs*0] = tmp;
+			pA += bs*sda;
+			pC += bs*sdc;
+			}
+		for(; ii<kmax; ii++)
+			{
+			tmp = pA[0+bs*0];
+			pA[0+bs*0] = pC[0+bs*0];
+			pC[0+bs*0] = tmp;
+			pA += 1;
+			pC += 1;
+			}
+		}
+	else
+		{
+		printf("\nscolsw: feature not implemented yet: offsetA!=offsetC\n\n");
+		exit(1);
+		}
+
+	return;
+
+	}
+
+
+
+// insert vector to vector, sparse formulation
+void svecin_libsp(int kmax, int *idx, float *x, float *y)
+	{
+
+	int jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		y[idx[jj]] = x[jj];
+		}
+	
+	}
+
+
+
+// adds vector to vector, sparse formulation
+void svecad_libsp(int kmax, int *idx, float alpha, float *x, float *y)
+	{
+
+	int jj;
+
+	for(jj=0; jj<kmax; jj++)
+		{
+		y[idx[jj]] += alpha * x[jj];
+		}
+	
+	}
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// return the memory size (in bytes) needed for a strmat
+int s_size_strmat(int m, int n)
+	{
+	const int bs = 8;
+	int nc = S_NC;
+	int al = bs*nc;
+	int pm = (m+bs-1)/bs*bs;
+	int cn = (n+nc-1)/nc*nc;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	int memory_size = (pm*cn+tmp)*sizeof(float);
+	return memory_size;
+	}
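+
+// Worked example (illustration; the real S_NC comes from blasfeo_block_size.h):
+// with bs=8 and, say, nc=4 (so al=32), a 10x10 matrix gives pm=16, cn=12 and
+// tmp=32 (the extra storage used for the dA diagonal), i.e.
+// (16*12+32)*sizeof(float) = 896 bytes.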
+
+
+
+// return the memory size (in bytes) needed for the diagonal of a strmat
+int s_size_diag_strmat(int m, int n)
+	{
+	const int bs = 8;
+	int nc = S_NC;
+	int al = bs*nc;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	int memory_size = tmp*sizeof(float);
+	return memory_size;
+	}
+
+
+
+// create a matrix structure for a matrix of size m*n by using memory passed by a pointer
+void s_create_strmat(int m, int n, struct s_strmat *sA, void *memory)
+	{
+	const int bs = 8;
+	int nc = S_NC;
+	int al = bs*nc;
+	sA->m = m;
+	sA->n = n;
+	int pm = (m+bs-1)/bs*bs;
+	int cn = (n+nc-1)/nc*nc;
+	sA->pm = pm;
+	sA->cn = cn;
+	float *ptr = (float *) memory;
+	sA->pA = ptr;
+	ptr += pm*cn;
+	int tmp = m<n ? (m+al-1)/al*al : (n+al-1)/al*al; // al(min(m,n)) // XXX max ???
+	sA->dA = ptr;
+	ptr += tmp;
+	sA->use_dA = 0;
+	sA->memory_size = (pm*cn+tmp)*sizeof(float);
+	return;
+	}
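+
+// Usage sketch (illustration only; any suitably aligned memory works,
+// posix_memalign is used here just as an example):
+//
+//   struct s_strmat sA;
+//   void *mem;
+//   posix_memalign(&mem, 64, s_size_strmat(m, n));
+//   s_create_strmat(m, n, &sA, mem);
+//   // ... use sA ...
+//   free(mem);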
+
+
+
+// return memory size (in bytes) needed for a strvec
+int s_size_strvec(int m)
+	{
+	const int bs = 8;
+//	int nc = S_NC;
+//	int al = bs*nc;
+	int pm = (m+bs-1)/bs*bs;
+	int memory_size = pm*sizeof(float);
+	return memory_size;
+	}
+
+
+
+// create a vector structure for a vector of size m by using memory passed by a pointer
+void s_create_strvec(int m, struct s_strvec *sa, void *memory)
+	{
+	const int bs = 8;
+//	int nc = S_NC;
+//	int al = bs*nc;
+	sa->m = m;
+	int pm = (m+bs-1)/bs*bs;
+	sa->pm = pm;
+	float *ptr = (float *) memory;
+	sa->pa = ptr;
+//	ptr += pm;
+	sa->memory_size = pm*sizeof(float);
+	return;
+	}
+
+
+
+// convert a matrix into a matrix structure
+void s_cvt_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, j, jj, m0, m1, m2;
+	float *B, *pB;
+	m0 = (bs-ai%bs)%bs;
+	if(m0>m)
+		m0 = m;
+	m1 = m - m0;
+	jj = 0;
+	for( ; jj<n-3; jj+=4)
+		{
+		B  =  A + jj*lda;
+		pB = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for( ; ii<m0; ii++)
+				{
+				pB[ii+bs*0] = B[ii+lda*0];
+				pB[ii+bs*1] = B[ii+lda*1];
+				pB[ii+bs*2] = B[ii+lda*2];
+				pB[ii+bs*3] = B[ii+lda*3];
+				}
+			B  += m0;
+			pB += m0 + bs*(sda-1);
+			}
+		for( ; ii<m-7; ii+=8)
+			{
+			// unroll 0
+			pB[0+bs*0] = B[0+lda*0];
+			pB[1+bs*0] = B[1+lda*0];
+			pB[2+bs*0] = B[2+lda*0];
+			pB[3+bs*0] = B[3+lda*0];
+			pB[4+bs*0] = B[4+lda*0];
+			pB[5+bs*0] = B[5+lda*0];
+			pB[6+bs*0] = B[6+lda*0];
+			pB[7+bs*0] = B[7+lda*0];
+			// unroll 1
+			pB[0+bs*1] = B[0+lda*1];
+			pB[1+bs*1] = B[1+lda*1];
+			pB[2+bs*1] = B[2+lda*1];
+			pB[3+bs*1] = B[3+lda*1];
+			pB[4+bs*1] = B[4+lda*1];
+			pB[5+bs*1] = B[5+lda*1];
+			pB[6+bs*1] = B[6+lda*1];
+			pB[7+bs*1] = B[7+lda*1];
+			// unroll 2
+			pB[0+bs*2] = B[0+lda*2];
+			pB[1+bs*2] = B[1+lda*2];
+			pB[2+bs*2] = B[2+lda*2];
+			pB[3+bs*2] = B[3+lda*2];
+			pB[4+bs*2] = B[4+lda*2];
+			pB[5+bs*2] = B[5+lda*2];
+			pB[6+bs*2] = B[6+lda*2];
+			pB[7+bs*2] = B[7+lda*2];
+			// unroll 3
+			pB[0+bs*3] = B[0+lda*3];
+			pB[1+bs*3] = B[1+lda*3];
+			pB[2+bs*3] = B[2+lda*3];
+			pB[3+bs*3] = B[3+lda*3];
+			pB[4+bs*3] = B[4+lda*3];
+			pB[5+bs*3] = B[5+lda*3];
+			pB[6+bs*3] = B[6+lda*3];
+			pB[7+bs*3] = B[7+lda*3];
+			// update
+			B  += 8;
+			pB += bs*sda;
+			}
+		for( ; ii<m; ii++)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			// col 1
+			pB[0+bs*1] = B[0+lda*1];
+			// col 2
+			pB[0+bs*2] = B[0+lda*2];
+			// col 3
+			pB[0+bs*3] = B[0+lda*3];
+			// update
+			B  += 1;
+			pB += 1;
+			}
+		}
+	for( ; jj<n; jj++)
+		{
+
+		B  =  A + jj*lda;
+		pB = pA + jj*bs;
+
+		ii = 0;
+		if(m0>0)
+			{
+			for( ; ii<m0; ii++)
+				{
+				pB[ii+bs*0] = B[ii+lda*0];
+				}
+			B  += m0;
+			pB += m0 + bs*(sda-1);
+			}
+		for( ; ii<m-7; ii+=8)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			pB[1+bs*0] = B[1+lda*0];
+			pB[2+bs*0] = B[2+lda*0];
+			pB[3+bs*0] = B[3+lda*0];
+			pB[4+bs*0] = B[4+lda*0];
+			pB[5+bs*0] = B[5+lda*0];
+			pB[6+bs*0] = B[6+lda*0];
+			pB[7+bs*0] = B[7+lda*0];
+			// update
+			B  += 8;
+			pB += bs*sda;
+			}
+		for( ; ii<m; ii++)
+			{
+			// col 0
+			pB[0+bs*0] = B[0+lda*0];
+			// update
+			B  += 1;
+			pB += 1;
+			}
+		}
+	return;
+	}
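+
+// Usage sketch (illustration only): pack a plain column-major 3x2 array into
+// the top-left corner of an already-created strmat sA:
+//
+//   float A[6] = {1, 2, 3, 4, 5, 6}; // column-major, lda = 3
+//   s_cvt_mat2strmat(3, 2, A, 3, &sA, 0, 0);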
+
+
+
+// convert and transpose a matrix into a matrix structure
+void s_cvt_tran_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, j, m0, m1, m2;
+	float *B, *pB;
+	m0 = (bs-ai%bs)%bs;
+	if(m0>n)
+		m0 = n;
+	m1 = n - m0;
+	ii = 0;
+	if(m0>0)
+		{
+		for(j=0; j<m; j++)
+			{
+			for(i=0; i<m0; i++)
+				{
+				pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+				}
+			}
+		A  += m0*lda;
+		pA += m0 + bs*(sda-1);
+		}
+	ii = 0;
+	for(; ii<m1-7; ii+=bs)
+		{
+		j=0;
+		B  = A + ii*lda;
+		pB = pA + ii*sda;
+		for(; j<m-3; j+=4)
+			{
+			// unroll 0
+			pB[0+0*bs] = B[0+0*lda];
+			pB[1+0*bs] = B[0+1*lda];
+			pB[2+0*bs] = B[0+2*lda];
+			pB[3+0*bs] = B[0+3*lda];
+			pB[4+0*bs] = B[0+4*lda];
+			pB[5+0*bs] = B[0+5*lda];
+			pB[6+0*bs] = B[0+6*lda];
+			pB[7+0*bs] = B[0+7*lda];
+			// unroll 1
+			pB[0+1*bs] = B[1+0*lda];
+			pB[1+1*bs] = B[1+1*lda];
+			pB[2+1*bs] = B[1+2*lda];
+			pB[3+1*bs] = B[1+3*lda];
+			pB[4+1*bs] = B[1+4*lda];
+			pB[5+1*bs] = B[1+5*lda];
+			pB[6+1*bs] = B[1+6*lda];
+			pB[7+1*bs] = B[1+7*lda];
+			// unroll 2
+			pB[0+2*bs] = B[2+0*lda];
+			pB[1+2*bs] = B[2+1*lda];
+			pB[2+2*bs] = B[2+2*lda];
+			pB[3+2*bs] = B[2+3*lda];
+			pB[4+2*bs] = B[2+4*lda];
+			pB[5+2*bs] = B[2+5*lda];
+			pB[6+2*bs] = B[2+6*lda];
+			pB[7+2*bs] = B[2+7*lda];
+			// unroll 3
+			pB[0+3*bs] = B[3+0*lda];
+			pB[1+3*bs] = B[3+1*lda];
+			pB[2+3*bs] = B[3+2*lda];
+			pB[3+3*bs] = B[3+3*lda];
+			pB[4+3*bs] = B[3+4*lda];
+			pB[5+3*bs] = B[3+5*lda];
+			pB[6+3*bs] = B[3+6*lda];
+			pB[7+3*bs] = B[3+7*lda];
+			B  += 4;
+			pB += 4*bs;
+			}
+		for(; j<m; j++)
+			{
+			// unroll 0
+			pB[0+0*bs] = B[0+0*lda];
+			pB[1+0*bs] = B[0+1*lda];
+			pB[2+0*bs] = B[0+2*lda];
+			pB[3+0*bs] = B[0+3*lda];
+			pB[4+0*bs] = B[0+4*lda];
+			pB[5+0*bs] = B[0+5*lda];
+			pB[6+0*bs] = B[0+6*lda];
+			pB[7+0*bs] = B[0+7*lda];
+			B  += 1;
+			pB += 1*bs;
+			}
+		}
+	if(ii<m1)
+		{
+		m2 = m1-ii;
+		if(bs<m2) m2 = bs;
+		for(j=0; j<m; j++)
+			{
+			for(i=0; i<m2; i++)
+				{
+				pA[i+j*bs+ii*sda] = A[j+(i+ii)*lda];
+				}
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector into a vector structure
+void s_cvt_vec2strvec(int m, float *a, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		pa[ii] = a[ii];
+	return;
+	}
+
+
+
+// convert a matrix structure into a matrix
+void s_cvt_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, jj;
+	int m0 = (bs-ai%bs)%bs;
+	float *ptr_pA;
+	jj=0;
+	for(; jj<n-3; jj+=4)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				// unroll 0
+				A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+				// unroll 1
+				A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+				// unroll 2
+				A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+				// unroll 3
+				A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		// TODO update A !!!!!
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			// unroll 0
+			A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+			A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
+			A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
+			A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
+			A[4+ii+lda*(jj+0)] = ptr_pA[4+bs*0];
+			A[5+ii+lda*(jj+0)] = ptr_pA[5+bs*0];
+			A[6+ii+lda*(jj+0)] = ptr_pA[6+bs*0];
+			A[7+ii+lda*(jj+0)] = ptr_pA[7+bs*0];
+			// unroll 1
+			A[0+ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+			A[1+ii+lda*(jj+1)] = ptr_pA[1+bs*1];
+			A[2+ii+lda*(jj+1)] = ptr_pA[2+bs*1];
+			A[3+ii+lda*(jj+1)] = ptr_pA[3+bs*1];
+			A[4+ii+lda*(jj+1)] = ptr_pA[4+bs*1];
+			A[5+ii+lda*(jj+1)] = ptr_pA[5+bs*1];
+			A[6+ii+lda*(jj+1)] = ptr_pA[6+bs*1];
+			A[7+ii+lda*(jj+1)] = ptr_pA[7+bs*1];
+			// unroll 2
+			A[0+ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+			A[1+ii+lda*(jj+2)] = ptr_pA[1+bs*2];
+			A[2+ii+lda*(jj+2)] = ptr_pA[2+bs*2];
+			A[3+ii+lda*(jj+2)] = ptr_pA[3+bs*2];
+			A[4+ii+lda*(jj+2)] = ptr_pA[4+bs*2];
+			A[5+ii+lda*(jj+2)] = ptr_pA[5+bs*2];
+			A[6+ii+lda*(jj+2)] = ptr_pA[6+bs*2];
+			A[7+ii+lda*(jj+2)] = ptr_pA[7+bs*2];
+			// unroll 3
+			A[0+ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+			A[1+ii+lda*(jj+3)] = ptr_pA[1+bs*3];
+			A[2+ii+lda*(jj+3)] = ptr_pA[2+bs*3];
+			A[3+ii+lda*(jj+3)] = ptr_pA[3+bs*3];
+			A[4+ii+lda*(jj+3)] = ptr_pA[4+bs*3];
+			A[5+ii+lda*(jj+3)] = ptr_pA[5+bs*3];
+			A[6+ii+lda*(jj+3)] = ptr_pA[6+bs*3];
+			A[7+ii+lda*(jj+3)] = ptr_pA[7+bs*3];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			// unroll 0
+			A[ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+			// unroll 1
+			A[ii+lda*(jj+1)] = ptr_pA[0+bs*1];
+			// unroll 2
+			A[ii+lda*(jj+2)] = ptr_pA[0+bs*2];
+			// unroll 3
+			A[ii+lda*(jj+3)] = ptr_pA[0+bs*3];
+			ptr_pA++;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				A[ii+lda*jj] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			A[0+ii+lda*(jj+0)] = ptr_pA[0+bs*0];
+			A[1+ii+lda*(jj+0)] = ptr_pA[1+bs*0];
+			A[2+ii+lda*(jj+0)] = ptr_pA[2+bs*0];
+			A[3+ii+lda*(jj+0)] = ptr_pA[3+bs*0];
+			A[4+ii+lda*(jj+0)] = ptr_pA[4+bs*0];
+			A[5+ii+lda*(jj+0)] = ptr_pA[5+bs*0];
+			A[6+ii+lda*(jj+0)] = ptr_pA[6+bs*0];
+			A[7+ii+lda*(jj+0)] = ptr_pA[7+bs*0];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			A[ii+lda*jj] = ptr_pA[0];
+			ptr_pA++;
+			}
+		}
+	return;
+	}
+
+
+
+// convert and transpose a matrix structure into a matrix
+void s_cvt_tran_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	int i, ii, jj;
+	int m0 = (bs-ai%bs)%bs;
+	float *ptr_pA;
+	jj=0;
+	for(; jj<n-3; jj+=4)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				// unroll 0
+				A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+				// unroll 1
+				A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+				// unroll 2
+				A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+				// unroll 3
+				A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		// TODO update A !!!!!
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			// unroll 0
+			A[jj+0+lda*(ii+0)] = ptr_pA[0+bs*0];
+			A[jj+0+lda*(ii+1)] = ptr_pA[1+bs*0];
+			A[jj+0+lda*(ii+2)] = ptr_pA[2+bs*0];
+			A[jj+0+lda*(ii+3)] = ptr_pA[3+bs*0];
+			A[jj+0+lda*(ii+4)] = ptr_pA[4+bs*0];
+			A[jj+0+lda*(ii+5)] = ptr_pA[5+bs*0];
+			A[jj+0+lda*(ii+6)] = ptr_pA[6+bs*0];
+			A[jj+0+lda*(ii+7)] = ptr_pA[7+bs*0];
+			// unroll 1
+			A[jj+1+lda*(ii+0)] = ptr_pA[0+bs*1];
+			A[jj+1+lda*(ii+1)] = ptr_pA[1+bs*1];
+			A[jj+1+lda*(ii+2)] = ptr_pA[2+bs*1];
+			A[jj+1+lda*(ii+3)] = ptr_pA[3+bs*1];
+			A[jj+1+lda*(ii+4)] = ptr_pA[4+bs*1];
+			A[jj+1+lda*(ii+5)] = ptr_pA[5+bs*1];
+			A[jj+1+lda*(ii+6)] = ptr_pA[6+bs*1];
+			A[jj+1+lda*(ii+7)] = ptr_pA[7+bs*1];
+			// unroll 2
+			A[jj+2+lda*(ii+0)] = ptr_pA[0+bs*2];
+			A[jj+2+lda*(ii+1)] = ptr_pA[1+bs*2];
+			A[jj+2+lda*(ii+2)] = ptr_pA[2+bs*2];
+			A[jj+2+lda*(ii+3)] = ptr_pA[3+bs*2];
+			A[jj+2+lda*(ii+4)] = ptr_pA[4+bs*2];
+			A[jj+2+lda*(ii+5)] = ptr_pA[5+bs*2];
+			A[jj+2+lda*(ii+6)] = ptr_pA[6+bs*2];
+			A[jj+2+lda*(ii+7)] = ptr_pA[7+bs*2];
+			// unroll 3
+			A[jj+3+lda*(ii+0)] = ptr_pA[0+bs*3];
+			A[jj+3+lda*(ii+1)] = ptr_pA[1+bs*3];
+			A[jj+3+lda*(ii+2)] = ptr_pA[2+bs*3];
+			A[jj+3+lda*(ii+3)] = ptr_pA[3+bs*3];
+			A[jj+3+lda*(ii+4)] = ptr_pA[4+bs*3];
+			A[jj+3+lda*(ii+5)] = ptr_pA[5+bs*3];
+			A[jj+3+lda*(ii+6)] = ptr_pA[6+bs*3];
+			A[jj+3+lda*(ii+7)] = ptr_pA[7+bs*3];
+			ptr_pA += sda*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			// unroll 0
+			A[jj+0+lda*ii] = ptr_pA[0+bs*0];
+			// unroll 1
+			A[jj+1+lda*ii] = ptr_pA[0+bs*1];
+			// unroll 2
+			A[jj+2+lda*ii] = ptr_pA[0+bs*2];
+			// unroll 3
+			A[jj+3+lda*ii] = ptr_pA[0+bs*3];
+			ptr_pA++;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ptr_pA = pA + jj*bs;
+		ii = 0;
+		if(m0>0)
+			{
+			for(; ii<m0; ii++)
+				{
+				A[jj+lda*ii] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m-bs+1; ii+=bs)
+			{
+			i=0;
+			// TODO update A !!!!!
+			// TODO unroll !!!!!!
+			for(; i<bs; i++)
+				{
+				A[jj+lda*(i+ii)] = ptr_pA[0];
+				ptr_pA++;
+				}
+			ptr_pA += (sda-1)*bs;
+			}
+		for(; ii<m; ii++)
+			{
+			A[jj+lda*ii] = ptr_pA[0];
+			ptr_pA++;
+			}
+		}
+	return;
+	}
+
+
+
+// convert a vector structure into a vector 
+void s_cvt_strvec2vec(int m, struct s_strvec *sa, int ai, float *a)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		a[ii] = pa[ii];
+	return;
+	}
+
+
+
+// cast a matrix into a matrix structure
+void s_cast_mat2strmat(float *A, struct s_strmat *sA)
+	{
+	sA->pA = A;
+	return;
+	}
+
+
+
+// cast a matrix into the diagonal of a matrix structure
+void s_cast_diag_mat2strmat(float *dA, struct s_strmat *sA)
+	{
+	sA->dA = dA;
+	return;
+	}
+
+
+
+// cast a vector into a vector structure
+void s_cast_vec2vecmat(float *a, struct s_strvec *sa)
+	{
+	sa->pa = a;
+	return;
+	}
+
+
+
+// insert element into strmat
+void sgein1_libstr(float a, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	pA[0] = a;
+	return;
+	}
+
+
+
+// extract element from strmat
+float sgeex1_libstr(struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	return pA[0];
+	}
+
+
+
+// insert element into strvec
+void svecin1_libstr(float a, struct s_strvec *sx, int xi)
+	{
+	const int bs = 8;
+	float *x = sx->pa + xi;
+	x[0] = a;
+	return;
+	}
+
+
+
+// extract element from strvec
+float svecex1_libstr(struct s_strvec *sx, int xi)
+	{
+	const int bs = 8;
+	float *x = sx->pa + xi;
+	return x[0];
+	}
+
+
+
+// set all elements of a strmat to a value
+void sgese_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai%bs + ai/bs*bs*sda + aj*bs;
+	int m0 = m<(bs-ai%bs)%bs ? m : (bs-ai%bs)%bs;
+	int ii, jj;
+	if(m0>0)
+		{
+		for(ii=0; ii<m0; ii++)
+			{
+			for(jj=0; jj<n; jj++)
+				{
+				pA[jj*bs] = alpha;
+				}
+			pA += 1;
+			}
+		pA += bs*(sda-1);
+		m -= m0;
+		}
+	for(ii=0; ii<m-7; ii+=8)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			pA[0+jj*bs] = alpha;
+			pA[1+jj*bs] = alpha;
+			pA[2+jj*bs] = alpha;
+			pA[3+jj*bs] = alpha;
+			pA[4+jj*bs] = alpha;
+			pA[5+jj*bs] = alpha;
+			pA[6+jj*bs] = alpha;
+			pA[7+jj*bs] = alpha;
+			}
+		pA += bs*sda;
+		}
+	for( ; ii<m; ii++)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			pA[jj*bs] = alpha;
+			}
+		pA += 1;
+		}
+	return;
+	}
+
+
+
+// set all elements of a strvec to a value
+void svecse_libstr(int m, float alpha, struct s_strvec *sx, int xi)
+	{
+	float *x = sx->pa + xi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		x[ii] = alpha;
+	return;
+	}
+
+
+
+// extract diagonal to vector
+void sdiaex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	sdiaex_lib(kmax, alpha, ai%bs, pA, sda, x);
+	return;
+	}
+
+
+
+// insert a vector into diagonal
+void sdiain_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	sdiain_lib(kmax, alpha, x, ai%bs, pA, sda);
+	return;
+	}
+
+
+
+// swap two rows of a matrix struct
+void srowsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	srowsw_lib(kmax, pA, pC);
+	return;
+	}
+
+
+
+// permute the rows of a matrix struct
+void srowpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			srowsw_libstr(sA->n, sA, ii, 0, sA, ipiv[ii], 0);
+		}
+	return;
+	}
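+
+// Note: ipiv is used here as a zero-based permutation vector: row ii is
+// swapped with row ipiv[ii] whenever ipiv[ii]!=ii (assumed to match the pivot
+// vector convention of the factorization routines in this library).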
+
+
+// extract a row into a vector
+void srowex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	srowex_lib(kmax, alpha, pA, x);
+	return;
+	}
+
+
+
+// insert a vector into a row
+void srowin_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	srowin_lib(kmax, alpha, x, pA);
+	return;
+	}
+
+
+
+// add a vector to a row
+void srowad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	float *x = sx->pa + xi;
+	srowad_lib(kmax, alpha, x, pA);
+	return;
+	}
+
+
+
+// swap two cols of a matrix struct
+void scolsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	scolsw_lib(kmax, ai%bs, pA, sda, ci%bs, pC, sdc);
+	return;
+	}
+
+
+
+// permute the cols of a matrix struct
+void scolpe_libstr(int kmax, int *ipiv, struct s_strmat *sA)
+	{
+	int ii;
+	for(ii=0; ii<kmax; ii++)
+		{
+		if(ipiv[ii]!=ii)
+			scolsw_libstr(sA->m, sA, 0, ii, sA, 0, ipiv[ii]);
+		}
+	return;
+	}
+
+
+
+// scale a generic strmat
+void sgesc_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj)
+	{
+
+	// early return
+	if(m==0 | n==0)
+		return;
+	
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** sgesc_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** sgesc_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** sgesc_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** sgesc_libstr : aj<0 : %d<0 *****\n", aj);
+	// inside matrix
+	// A: m x n
+	if(ai+m > sA->m) printf("\n***** sgesc_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** sgesc_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+#endif
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+	int offsetA = ai%bs;
+
+	int ii, mna;
+
+	if(offsetA>0)
+		{
+		mna = bs-offsetA;
+		mna = m<mna ? m : mna;
+		kernel_sgesc_8_gen_lib8(n, &alpha, &pA[offsetA], mna);
+		m -= mna;
+		pA += 8*sda;
+		}
+	ii = 0;
+	for( ; ii<m-7; ii+=8)
+		{
+		kernel_sgesc_8_lib8(n, &alpha, &pA[0]);
+		pA += 8*sda;
+		}
+	if(ii<m)
+		{
+		kernel_sgesc_8_gen_lib8(n, &alpha, &pA[0], m-ii);
+		}
+
+	return;
+
+	}
+
+
+
+// copy a generic strmat into a generic strmat
+void sgecp_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+	{
+
+	// early return
+	if(m==0 | n==0)
+		return;
+	
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** sgecp_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** sgecp_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** sgecp_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** sgecp_libstr : aj<0 : %d<0 *****\n", aj);
+	if(bi<0) printf("\n****** sgecp_libstr : bi<0 : %d<0 *****\n", bi);
+	if(bj<0) printf("\n****** sgecp_libstr : bj<0 : %d<0 *****\n", bj);
+	// inside matrix
+	// A: m x n
+	if(ai+m > sA->m) printf("\n***** sgecp_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** sgecp_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// B: m x n
+	if(bi+m > sB->m) printf("\n***** sgecp_libstr : bi+m > row(B) : %d+%d > %d *****\n", bi, m, sB->m);
+	if(bj+n > sB->n) printf("\n***** sgecp_libstr : bj+n > col(B) : %d+%d > %d *****\n", bj, n, sB->n);
+#endif
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+	float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+	int offsetA = ai%bs;
+	int offsetB = bi%bs;
+
+	int ii, mna;
+
+#if 1
+	if(offsetB>0)
+		{
+		if(offsetB>offsetA)
+			{
+			mna = bs-offsetB;
+			mna = m<mna ? m : mna;
+			kernel_sgecp_8_0_gen_lib8(n, &pA[offsetA], &pB[offsetB], mna);
+			m -= mna;
+			//pA += 8*sda;
+			pB += 8*sdb;
+			}
+		else
+			{
+			if(offsetA==0)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgecp_8_0_gen_lib8(n, &pA[0], &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==1)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgecp_8_1_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==2)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgecp_8_2_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==3)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgecp_8_3_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==4)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgecp_8_4_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==5)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgecp_8_5_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==6)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgecp_8_6_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==7)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgecp_8_7_gen_lib8(n, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			}
+		}
+#endif
+
+	// same alignment
+	if(offsetA==offsetB)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgecp_8_0_lib8(n, pA, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgecp_8_0_gen_lib8(n, pA, pB, m-ii);
+			}
+		return;
+		}
+	// XXX different alignment: search tree ???
+	// skip one element of A
+	else if(offsetA==(offsetB+1)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgecp_8_1_lib8(n, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgecp_8_1_gen_lib8(n, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip two elements of A
+	else if(offsetA==(offsetB+2)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgecp_8_2_lib8(n, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgecp_8_2_gen_lib8(n, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip three elements of A
+	else if(offsetA==(offsetB+3)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgecp_8_3_lib8(n, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgecp_8_3_gen_lib8(n, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip four elements of A
+	else if(offsetA==(offsetB+4)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgecp_8_4_lib8(n, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgecp_8_4_gen_lib8(n, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip five elements of A
+	else if(offsetA==(offsetB+5)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgecp_8_5_lib8(n, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgecp_8_5_gen_lib8(n, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip six elements of A
+	else if(offsetA==(offsetB+6)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgecp_8_6_lib8(n, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgecp_8_6_gen_lib8(n, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip seven elements of A
+	else //if(offsetA==(offsetB+7)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgecp_8_7_lib8(n, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgecp_8_7_gen_lib8(n, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	
+	return;
+
+	}
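+
+// Usage sketch (illustrative): copy the 4x5 block of A starting at row 2,
+// column 1 into B starting at row 0, column 3:
+//   sgecp_libstr(4, 5, &sA, 2, 1, &sB, 0, 3);
+// Mismatching row offsets ai%bs and bi%bs are handled by the alignment-specific
+// kernels selected above.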
+
+
+
+// scale a strvec
+void svecsc_libstr(int m, float alpha, struct s_strvec *sa, int ai)
+	{
+	float *pa = sa->pa + ai;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pa[ii+0] *= alpha;
+		pa[ii+1] *= alpha;
+		pa[ii+2] *= alpha;
+		pa[ii+3] *= alpha;
+		}
+	for(; ii<m; ii++)
+		{
+		pa[ii+0] *= alpha;
+		}
+	return;
+	}
+
+
+
+// copy a strvec into a strvec
+void sveccp_libstr(int m, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci)
+	{
+	float *pa = sa->pa + ai;
+	float *pc = sc->pa + ci;
+	int ii;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		pc[ii+0] = pa[ii+0];
+		pc[ii+1] = pa[ii+1];
+		pc[ii+2] = pa[ii+2];
+		pc[ii+3] = pa[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		pc[ii+0] = pa[ii+0];
+		}
+	return;
+	}
+
+
+
+// copy a lower triangular strmat into a lower triangular strmat
+void strcp_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	strcp_l_lib(m, ai%bs, pA, sda, ci%bs, pC, sdc);
+	// XXX uses full matrix copy !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//	sgecp_libstr(m, m, sA, ai, aj, sC, ci, cj);
+	return;
+	}
+
+
+
+// scale and add a generic strmat into a generic strmat
+void sgead_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+	{
+
+	// early return
+	if(m==0 | n==0)
+		return;
+	
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** sgead_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** sgead_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** sgead_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** sgead_libstr : aj<0 : %d<0 *****\n", aj);
+	if(bi<0) printf("\n****** sgead_libstr : bi<0 : %d<0 *****\n", bi);
+	if(bj<0) printf("\n****** sgead_libstr : bj<0 : %d<0 *****\n", bj);
+	// inside matrix
+	// A: m x n
+	if(ai+m > sA->m) printf("\n***** sgead_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** sgead_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// B: m x n
+	if(bi+m > sB->m) printf("\n***** sgead_libstr : bi+m > row(B) : %d+%d > %d *****\n", bi, m, sB->m);
+	if(bj+n > sB->n) printf("\n***** sgead_libstr : bj+n > col(B) : %d+%d > %d *****\n", bj, n, sB->n);
+#endif
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+	float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+	int offsetA = ai%bs;
+	int offsetB = bi%bs;
+
+	int ii, mna;
+
+#if 1
+	if(offsetB>0)
+		{
+		if(offsetB>offsetA)
+			{
+			mna = bs-offsetB;
+			mna = m<mna ? m : mna;
+			kernel_sgead_8_0_gen_lib8(n, &alpha, &pA[offsetA], &pB[offsetB], mna);
+			m -= mna;
+			//pA += 8*sda;
+			pB += 8*sdb;
+			}
+		else
+			{
+			if(offsetA==0)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgead_8_0_gen_lib8(n, &alpha, &pA[0], &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==1)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgead_8_1_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==2)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgead_8_2_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==3)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgead_8_3_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==4)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgead_8_4_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==5)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgead_8_5_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==6)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgead_8_6_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			else if(offsetA==7)
+				{
+				mna = bs-offsetB;
+				mna = m<mna ? m : mna;
+				kernel_sgead_8_7_gen_lib8(n, &alpha, &pA[0], sda, &pB[offsetB], mna);
+				m -= mna;
+				pA += 8*sda;
+				pB += 8*sdb;
+				}
+			}
+		}
+#endif
+
+	// same alignment
+	if(offsetA==offsetB)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgead_8_0_lib8(n, &alpha, pA, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgead_8_0_gen_lib8(n, &alpha, pA, pB, m-ii);
+			}
+		return;
+		}
+	// XXX different alignment: search tree ???
+	// skip one element of A
+	else if(offsetA==(offsetB+1)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgead_8_1_lib8(n, &alpha, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgead_8_1_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip two elements of A
+	else if(offsetA==(offsetB+2)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgead_8_2_lib8(n, &alpha, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgead_8_2_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip three elements of A
+	else if(offsetA==(offsetB+3)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgead_8_3_lib8(n, &alpha, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgead_8_3_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip four elements of A
+	else if(offsetA==(offsetB+4)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgead_8_4_lib8(n, &alpha, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgead_8_4_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip five elements of A
+	else if(offsetA==(offsetB+5)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgead_8_5_lib8(n, &alpha, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgead_8_5_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip six elements of A
+	else if(offsetA==(offsetB+6)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgead_8_6_lib8(n, &alpha, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgead_8_6_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	// skip seven elements of A
+	else //if(offsetA==(offsetB+7)%bs)
+		{
+		ii = 0;
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_sgead_8_7_lib8(n, &alpha, pA, sda, pB);
+			pA += 8*sda;
+			pB += 8*sdb;
+			}
+		if(ii<m)
+			{
+			kernel_sgead_8_7_gen_lib8(n, &alpha, pA, sda, pB, m-ii);
+			}
+		return;
+		}
+	
+	return;
+
+	}
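+
+// In other words: B[bi+i][bj+j] += alpha * A[ai+i][aj+j] for i<m, j<n;
+// illustrative example: sgead_libstr(m, n, -1.0, &sA, 0, 0, &sB, 0, 0)
+// subtracts the top-left m x n block of A from B in place.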
+
+
+
+// copy and transpose a generic strmat into a generic strmat
+void sgetr_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj)
+	{
+
+	// early return
+	if(m==0 | n==0)
+		return;
+	
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** sgetr_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** sgetr_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** sgetr_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** sgetr_libstr : aj<0 : %d<0 *****\n", aj);
+	if(bi<0) printf("\n****** sgetr_libstr : bi<0 : %d<0 *****\n", bi);
+	if(bj<0) printf("\n****** sgetr_libstr : bj<0 : %d<0 *****\n", bj);
+	// inside matrix
+	// A: m x n
+	if(ai+m > sA->m) printf("\n***** sgetr_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** sgetr_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// B: n x m
+	if(bi+n > sB->m) printf("\n***** sgetr_libstr : bi+n > row(B) : %d+%d > %d *****\n", bi, n, sB->m);
+	if(bj+m > sB->n) printf("\n***** sgetr_libstr : bj+m > col(B) : %d+%d > %d *****\n", bj, m, sB->n);
+#endif
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + aj*bs;
+	float *pB = sB->pA + bi/bs*bs*sdb + bj*bs;
+	int offsetA = ai%bs;
+	int offsetB = bi%bs;
+
+	int ii, nna;
+
+	if(offsetA==0)
+		{
+		if(offsetB>0)
+			{
+			nna = bs-offsetB;
+			nna = n<nna ? n : nna;
+			kernel_sgetr_8_0_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+			n -= nna;
+			pA += nna*bs;
+			pB += 8*sdb;
+			}
+		for(ii=0; ii<n-7; ii+=8)
+			{
+			kernel_sgetr_8_0_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+			}
+		if(ii<n)
+			{
+			kernel_sgetr_8_0_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+			}
+		}
+	// TODO log search for offsetA>0 ???
+	else if(offsetA==1)
+		{
+		if(offsetB>0)
+			{
+			nna = bs-offsetB;
+			nna = n<nna ? n : nna;
+			kernel_sgetr_8_1_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+			n -= nna;
+			pA += nna*bs;
+			pB += 8*sdb;
+			}
+		for(ii=0; ii<n-7; ii+=8)
+			{
+			kernel_sgetr_8_1_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+			}
+		if(ii<n)
+			{
+			kernel_sgetr_8_1_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+			}
+		}
+	else if(offsetA==2)
+		{
+		ii = 0;
+		if(offsetB>0)
+			{
+			nna = bs-offsetB;
+			nna = n<nna ? n : nna;
+			kernel_sgetr_8_2_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+			n -= nna;
+			pA += nna*bs;
+			pB += 8*sdb;
+			}
+		for( ; ii<n-7; ii+=8)
+			{
+			kernel_sgetr_8_2_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+			}
+		if(ii<n)
+			{
+			kernel_sgetr_8_2_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+			}
+		}
+	else if(offsetA==3)
+		{
+		ii = 0;
+		if(offsetB>0)
+			{
+			nna = bs-offsetB;
+			nna = n<nna ? n : nna;
+			kernel_sgetr_8_3_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+			n -= nna;
+			pA += nna*bs;
+			pB += 8*sdb;
+			}
+		for( ; ii<n-7; ii+=8)
+			{
+			kernel_sgetr_8_3_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+			}
+		if(ii<n)
+			{
+			kernel_sgetr_8_3_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+			}
+		}
+	else if(offsetA==4)
+		{
+		ii = 0;
+		if(offsetB>0)
+			{
+			nna = bs-offsetB;
+			nna = n<nna ? n : nna;
+			kernel_sgetr_8_4_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+			n -= nna;
+			pA += nna*bs;
+			pB += 8*sdb;
+			}
+		for( ; ii<n-7; ii+=8)
+			{
+			kernel_sgetr_8_4_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+			}
+		if(ii<n)
+			{
+			kernel_sgetr_8_4_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+			}
+		}
+	else if(offsetA==5)
+		{
+		ii = 0;
+		if(offsetB>0)
+			{
+			nna = bs-offsetB;
+			nna = n<nna ? n : nna;
+			kernel_sgetr_8_5_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+			n -= nna;
+			pA += nna*bs;
+			pB += 8*sdb;
+			}
+		for( ; ii<n-7; ii+=8)
+			{
+			kernel_sgetr_8_5_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+			}
+		if(ii<n)
+			{
+			kernel_sgetr_8_5_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+			}
+		}
+	else if(offsetA==6)
+		{
+		ii = 0;
+		if(offsetB>0)
+			{
+			nna = bs-offsetB;
+			nna = n<nna ? n : nna;
+			kernel_sgetr_8_6_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+			n -= nna;
+			pA += nna*bs;
+			pB += 8*sdb;
+			}
+		for( ; ii<n-7; ii+=8)
+			{
+			kernel_sgetr_8_6_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+			}
+		if(ii<n)
+			{
+			kernel_sgetr_8_6_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+			}
+		}
+	else if(offsetA==7)
+		{
+		ii = 0;
+		if(offsetB>0)
+			{
+			nna = bs-offsetB;
+			nna = n<nna ? n : nna;
+			kernel_sgetr_8_7_gen_lib8(m, &pA[0], sda, &pB[offsetB], nna);
+			n -= nna;
+			pA += nna*bs;
+			pB += 8*sdb;
+			}
+		for( ; ii<n-7; ii+=8)
+			{
+			kernel_sgetr_8_7_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb]);
+			}
+		if(ii<n)
+			{
+			kernel_sgetr_8_7_gen_lib8(m, &pA[ii*bs], sda, &pB[ii*sdb], n-ii);
+			}
+		}
+
+	return;
+
+	}
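+
+// In other words: B[bi+j][bj+i] = A[ai+i][aj+j] for i<m, j<n, i.e. the m x n
+// block of A is written transposed into an n x m block of B (see the DIM_CHECK
+// bounds above, which check B against n rows and m columns).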
+
+
+
+// copy and transpose a lower triangular strmat into an upper triangular strmat
+void strtr_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	strtr_l_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+	return;
+	}
+
+
+
+// copy and transpose an upper triangular strmat into a lower triangular strmat
+void strtr_u_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj)
+	{
+	const int bs = 8;
+	int sda = sA->cn;
+	float *pA = sA->pA + ai/bs*bs*sda + ai%bs + aj*bs;
+	int sdc = sC->cn;
+	float *pC = sC->pA + ci/bs*bs*sdc + ci%bs + cj*bs;
+	strtr_u_lib(m, 1.0, ai%bs, pA, sda, ci%bs, pC, sdc); // TODO remove alpha !!!
+	return;
+	}
+
+
+
+// insert a strvec to diagonal of strmat, sparse formulation 
+void sdiain_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	const int bs = 8;
+	float *x = sx->pa + xi;
+	int sdd = sD->cn;
+	float *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = alpha * x[jj];
+		}
+	return;
+	}
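+
+// Illustrative example: with kmax=3 and idx = {0, 2, 5}, the loop above writes
+// alpha*x[0], alpha*x[1], alpha*x[2] to D(di+0,dj+0), D(di+2,dj+2) and
+// D(di+5,dj+5); the _sp_ routines below follow the same indexing convention.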
+
+
+
+// extract the diagonal of a strmat to a strvec, sparse formulation 
+void sdiaex_sp_libstr(int kmax, float alpha, int *idx, struct s_strmat *sD, int di, int dj, struct s_strvec *sx, int xi)
+	{
+	const int bs = 8;
+	float *x = sx->pa + xi;
+	int sdd = sD->cn;
+	float *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		x[jj] = alpha * pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to diagonal of strmat, sparse formulation 
+void sdiaad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	const int bs = 8;
+	float *x = sx->pa + xi;
+	int sdd = sD->cn;
+	float *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] += alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to another strvec and insert to diagonal of strmat, sparse formulation 
+void sdiaadin_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	const int bs = 8;
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	int sdd = sD->cn;
+	float *pD = sD->pA;
+	int ii, jj;
+	for(jj=0; jj<kmax; jj++)
+		{
+		ii = idx[jj];
+		pD[(ii+di)/bs*bs*sdd+(ii+di)%bs+(ii+dj)*bs] = y[jj] + alpha * x[jj];
+		}
+	return;
+	}
+
+
+
+// add scaled strvec to row of strmat, sparse formulation 
+void srowad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj)
+	{
+	const int bs = 8;
+	float *x = sx->pa + xi;
+	int sdd = sD->cn;
+	float *pD = sD->pA + di/bs*bs*sdd + di%bs + dj*bs;
+	srowad_libsp(kmax, idx, alpha, x, pD);
+	return;
+	}
+
+
+
+// add a scaled strvec to a strvec, sparse formulation
+void svecad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sy, int yi)
+	{
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	svecad_libsp(kmax, idx, alpha, x, y);
+	return;
+	}
+
+
+
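+// insert a scaled strvec into a strvec, sparse formulation (z[idx[ii]] = alpha*x[ii])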
+void svecin_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[idx[ii]] = alpha * x[ii];
+	return;
+	}
+
+
+
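+// extract a scaled strvec from a strvec, sparse formulation (z[ii] = alpha*x[idx[ii]])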
+void svecex_sp_libstr(int m, float alpha, int *idx, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+	int ii;
+	for(ii=0; ii<m; ii++)
+		z[ii] = alpha * x[idx[ii]];
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
diff --git a/auxiliary/v_aux_ext_dep_lib.c b/auxiliary/v_aux_ext_dep_lib.c
new file mode 100644
index 0000000..3bf5f90
--- /dev/null
+++ b/auxiliary/v_aux_ext_dep_lib.c
@@ -0,0 +1,138 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#if 0
+#include <malloc.h>
+#endif
+
+
+
+/* creates a zero matrix given the size in bytes */
+void v_zeros(void **ptrA, int size)
+	{
+	*ptrA = (void *) malloc(size);
+	char *A = *ptrA;
+	int i;
+	for(i=0; i<size; i++) A[i] = 0;
+	}
+
+
+
+/* creates a zero matrix aligned to a cache line given the size in bytes */
+void v_zeros_align(void **ptrA, int size)
+	{
+#if defined(OS_WINDOWS)
+	*ptrA = _aligned_malloc( size, 64 );
+#else
+	int err = posix_memalign(ptrA, 64, size);
+	if(err!=0)
+		{
+		printf("Memory allocation error\n");
+		exit(1);
+		}
+#endif
+	char *A = *ptrA;
+	int i;
+	for(i=0; i<size; i++) A[i] = 0;
+	}
+
+
+
+/* frees matrix */
+void v_free(void *pA)
+	{
+	free( pA );
+	}
+
+
+
+/* frees aligned matrix */
+void v_free_align(void *pA)
+	{
+#if defined(OS_WINDOWS)
+	_aligned_free( pA );
+#else
+	free( pA );
+#endif
+	}
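+
+/* usage sketch (illustrative): a cache-line-aligned, zero-initialized buffer
+   for an n x n matrix of doubles, released with the matching free routine:
+     void *mem;
+     v_zeros_align(&mem, n*n*sizeof(double));
+     ...
+     v_free_align(mem);
+*/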
+
+
+
+/* creates a zero matrix given the size in bytes */
+void c_zeros(char **ptrA, int size)
+	{
+	*ptrA = malloc(size);
+	char *A = *ptrA;
+	int i;
+	for(i=0; i<size; i++) A[i] = 0;
+	}
+
+
+
+/* creates a zero matrix aligned to a cache line given the size in bytes */
+void c_zeros_align(char **ptrA, int size)
+	{
+#if defined(OS_WINDOWS)
+	*ptrA = _aligned_malloc( size, 64 );
+#else
+	void *temp;
+	int err = posix_memalign(&temp, 64, size);
+	if(err!=0)
+		{
+		printf("Memory allocation error\n");
+		exit(1);
+		}
+	*ptrA = temp;
+#endif
+	char *A = *ptrA;
+	int i;
+	for(i=0; i<size; i++) A[i] = 0;
+	}
+
+
+
+/* frees matrix */
+void c_free(char *pA)
+	{
+	free( pA );
+	}
+
+
+
+/* frees aligned matrix */
+void c_free_align(char *pA)
+	{
+#if defined(OS_WINDOWS)
+	_aligned_free( pA );
+#else
+	free( pA );
+#endif
+	}
+
diff --git a/blas/Makefile b/blas/Makefile
new file mode 100644
index 0000000..304b448
--- /dev/null
+++ b/blas/Makefile
@@ -0,0 +1,88 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../Makefile.rule
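+
+# TARGET and LA are expected to be defined in ../Makefile.rule (included above);
+# they select which precision- and architecture-specific objects are built below.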
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib8.o s_blas2_lib8.o s_blas2_diag_lib.o s_blas3_lib8.o s_blas3_diag_lib8.o s_lapack_lib8.o
+endif
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib8.o s_blas2_lib8.o s_blas2_diag_lib.o s_blas3_lib8.o s_blas3_diag_lib8.o s_lapack_lib8.o
+endif
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib4.o s_blas2_lib4.o s_blas2_diag_lib.o s_blas3_lib4.o s_blas3_diag_lib4.o s_lapack_lib4.o
+endif
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib4.o s_blas2_lib4.o s_blas2_diag_lib.o s_blas3_lib4.o s_blas3_diag_lib4.o s_lapack_lib4.o
+endif
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib4.o s_blas2_lib4.o s_blas2_diag_lib.o s_blas3_lib4.o s_blas3_diag_lib4.o s_lapack_lib4.o
+endif
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib4.o s_blas2_lib4.o s_blas2_diag_lib.o s_blas3_lib4.o s_blas3_diag_lib4.o s_lapack_lib4.o
+endif
+ifeq ($(TARGET), GENERIC)
+OBJS += d_blas1_lib4.o d_blas2_lib4.o d_blas2_diag_lib.o d_blas3_lib4.o d_blas3_diag_lib4.o d_lapack_lib4.o
+OBJS += s_blas1_lib4.o s_blas2_lib4.o s_blas2_diag_lib.o s_blas3_lib4.o s_blas3_diag_lib4.o s_lapack_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+OBJS += d_blas1_lib.o d_blas2_lib.o d_blas2_diag_lib.o d_blas3_lib.o d_blas3_diag_lib.o d_lapack_lib.o
+OBJS += s_blas1_lib.o s_blas2_lib.o s_blas2_diag_lib.o s_blas3_lib.o s_blas3_diag_lib.o s_lapack_lib.o
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+	rm -f *.o
+	rm -f *.s
+
+d_blas1_lib.o: d_blas1_lib.c x_blas1_lib.c
+s_blas1_lib.o: s_blas1_lib.c x_blas1_lib.c
+d_blas2_lib.o: d_blas2_lib.c x_blas2_lib.c
+s_blas2_lib.o: s_blas2_lib.c x_blas2_lib.c
+d_blas2_diag_lib.o: d_blas2_diag_lib.c x_blas2_diag_lib.c
+s_blas2_diag_lib.o: s_blas2_diag_lib.c x_blas2_diag_lib.c
+d_blas3_lib.o: d_blas3_lib.c x_blas3_lib.c
+s_blas3_lib.o: s_blas3_lib.c x_blas3_lib.c
+d_blas3_diag_lib.o: d_blas3_diag_lib.c x_blas3_diag_lib.c
+s_blas3_diag_lib.o: s_blas3_diag_lib.c x_blas3_diag_lib.c
+d_lapack_lib.o: d_lapack_lib.c x_lapack_lib.c
+s_lapack_lib.o: s_lapack_lib.c x_lapack_lib.c
diff --git a/blas/d_blas.h b/blas/d_blas.h
new file mode 100644
index 0000000..fc5058b
--- /dev/null
+++ b/blas/d_blas.h
@@ -0,0 +1,66 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// headers to reference BLAS and LAPACK routines employed in BLASFEO WR
+
+// level 1
+void dcopy_(int *m, double *x, int *incx, double *y, int *incy);
+void daxpy_(int *m, double *alpha, double *x, int *incx, double *y, int *incy);
+void dscal_(int *m, double *alpha, double *x, int *incx);
+
+// level 2
+void dgemv_(char *ta, int *m, int *n, double *alpha, double *A, int *lda, double *x, int *incx, double *beta, double *y, int *incy);
+void dsymv_(char *uplo, int *m, double *alpha, double *A, int *lda, double *x, int *incx, double *beta, double *y, int *incy);
+void dtrmv_(char *uplo, char *trans, char *diag, int *n, double *A, int *lda, double *x, int *incx);
+void dtrsv_(char *uplo, char *trans, char *diag, int *n, double *A, int *lda, double *x, int *incx);
+void dger_(int *m, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *A, int *lda);
+
+// level 3
+void dgemm_(char *ta, char *tb, int *m, int *n, int *k, double *alpha, double *A, int *lda, double *B, int *ldb, double *beta, double *C, int *ldc);
+void dsyrk_(char *uplo, char *trans, int *n, int *k, double *alpha, double *A, int *lda, double *beta, double *C, int *ldc);
+void dtrmm_(char *side, char *uplo, char *trans, char *diag, int *m, int *n, double *alpha, double *A, int *lda, double *B, int *ldb);
+void dtrsm_(char *side, char *uplo, char *trans, char *diag, int *m, int *n, double *alpha, double *A, int *lda, double *B, int *ldb);
+
+// lapack
+int dpotrf_(char *uplo, int *m, double *A, int *lda, int *info);
+int dgetrf_(int *m, int *n, double *A, int *lda, int *ipiv, int *info);
+void dgeqrf_(int *m, int *n, double *A, int *lda, double *tau, double *work, int *lwork, int *info);
+void dgeqr2_(int *m, int *n, double *A, int *lda, double *tau, double *work, int *info);
+void dgelqf_(int *m, int *n, double *A, int *lda, double *tau, double *work, int *lwork, int *info);
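+
+// Illustrative example: these are Fortran symbols, so every argument is passed
+// by pointer and matrices are column-major; e.g.
+//   char tn = 'N'; double one = 1.0, zero = 0.0;
+//   dgemm_(&tn, &tn, &m, &n, &k, &one, A, &m, B, &k, &zero, C, &m);
+// computes C = A*B for an m x k matrix A and a k x n matrix B (assuming
+// lda=m, ldb=k, ldc=m).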
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/blas/d_blas1_lib.c b/blas/d_blas1_lib.c
new file mode 100644
index 0000000..1fd19d3
--- /dev/null
+++ b/blas/d_blas1_lib.c
@@ -0,0 +1,54 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#include "d_blas.h"
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
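+// Single-/double-precision code sharing: the macros below map the generic names
+// used in x_blas1_lib.c (textually included at the end of this file) to their
+// double-precision counterparts.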
+#define REAL double
+
+#define STRMAT d_strmat
+#define STRVEC d_strvec
+
+#define AXPY_LIBSTR daxpy_libstr
+#define VECMULDOT_LIBSTR dvecmuldot_libstr
+#define DOT_LIBSTR ddot_libstr
+
+#define AXPY daxpy_
+#define COPY dcopy_
+
+
+#include "x_blas1_lib.c"
diff --git a/blas/d_blas1_lib4.c b/blas/d_blas1_lib4.c
new file mode 100644
index 0000000..a4155a9
--- /dev/null
+++ b/blas/d_blas1_lib4.c
@@ -0,0 +1,263 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+void daxpy_libstr(int m, double alpha, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
+	{
+
+	if(m<=0)
+		return;
+
+	double *x = sx->pa + xi;
+	double *y = sy->pa + yi;
+	double *z = sz->pa + zi;
+
+	int ii;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	__m256d
+		v_alpha, v_tmp,
+		v_x0, v_y0,
+		v_x1, v_y1;
+#endif
+
+	ii = 0;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	v_alpha = _mm256_broadcast_sd( &alpha );
+	for( ; ii<m-7; ii+=8)
+		{
+		v_x0  = _mm256_loadu_pd( &x[ii+0] );
+		v_x1  = _mm256_loadu_pd( &x[ii+4] );
+		v_y0  = _mm256_loadu_pd( &y[ii+0] );
+		v_y1  = _mm256_loadu_pd( &y[ii+4] );
+#if defined(TARGET_X64_INTEL_HASWELL)
+		v_y0  = _mm256_fmadd_pd( v_alpha, v_x0, v_y0 );
+		v_y1  = _mm256_fmadd_pd( v_alpha, v_x1, v_y1 );
+#else // sandy bridge
+		v_tmp = _mm256_mul_pd( v_alpha, v_x0 );
+		v_y0  = _mm256_add_pd( v_tmp, v_y0 );
+		v_tmp = _mm256_mul_pd( v_alpha, v_x1 );
+		v_y1  = _mm256_add_pd( v_tmp, v_y1 );
+#endif
+		_mm256_storeu_pd( &z[ii+0], v_y0 );
+		_mm256_storeu_pd( &z[ii+4], v_y1 );
+		}
+	for( ; ii<m-3; ii+=4)
+		{
+		v_x0  = _mm256_loadu_pd( &x[ii] );
+		v_y0  = _mm256_loadu_pd( &y[ii] );
+#if defined(TARGET_X64_INTEL_HASWELL)
+		v_y0  = _mm256_fmadd_pd( v_alpha, v_x0, v_y0 );
+#else // sandy bridge
+		v_tmp = _mm256_mul_pd( v_alpha, v_x0 );
+		v_y0  = _mm256_add_pd( v_tmp, v_y0 );
+#endif
+		_mm256_storeu_pd( &z[ii], v_y0 );
+		}
+#else
+	for( ; ii<m-3; ii+=4)
+		{
+		z[ii+0] = y[ii+0] + alpha*x[ii+0];
+		z[ii+1] = y[ii+1] + alpha*x[ii+1];
+		z[ii+2] = y[ii+2] + alpha*x[ii+2];
+		z[ii+3] = y[ii+3] + alpha*x[ii+3];
+		}
+#endif
+	for( ; ii<m; ii++)
+		{
+		z[ii+0] = y[ii+0] + alpha*x[ii+0];
+		}
+
+	return;
+	}
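+
+// Usage sketch (illustrative): z = y - x over the first n entries, reading each
+// vector from offset 0:
+//   daxpy_libstr(n, -1.0, &sx, 0, &sy, 0, &sz, 0);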
+
+
+
+// multiply two vectors and compute dot product
+double dvecmuldot_libstr(int m, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
+	{
+
+	if(m<=0)
+		return 0.0;
+
+	double *x = sx->pa + xi;
+	double *y = sy->pa + yi;
+	double *z = sz->pa + zi;
+	int ii;
+	double dot = 0.0;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	__m128d
+		u_tmp, u_dot;
+	__m256d
+		v_tmp,
+		v_x0, v_y0, v_z0;
+	
+	v_tmp = _mm256_setzero_pd();
+#endif
+
+	ii = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	for(; ii<m-3; ii+=4)
+		{
+		v_x0 = _mm256_loadu_pd( &x[ii+0] );
+		v_y0 = _mm256_loadu_pd( &y[ii+0] );
+		v_z0 = _mm256_mul_pd( v_x0, v_y0 );
+		_mm256_storeu_pd( &z[ii+0], v_z0 );
+		v_tmp = _mm256_add_pd( v_tmp, v_z0 );
+		}
+#endif
+	for(; ii<m; ii++)
+		{
+		z[ii+0] = x[ii+0] * y[ii+0];
+		dot += z[ii+0];
+		}
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	// dot product
+	u_tmp = _mm_add_pd( _mm256_castpd256_pd128( v_tmp ), _mm256_extractf128_pd( v_tmp, 0x1 ) );
+	u_tmp = _mm_hadd_pd( u_tmp, u_tmp);
+	u_dot = _mm_load_sd( &dot );
+	u_dot = _mm_add_sd( u_dot, u_tmp );
+	_mm_store_sd( &dot, u_dot );
+#endif
+	return dot;
+	}
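+
+// In other words: z[i] = x[i]*y[i] for i<m, and the returned value is sum_i z[i].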
+
+
+
+// compute dot product of two vectors
+double ddot_libstr(int m, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi)
+	{
+
+	if(m<=0)
+		return 0.0;
+
+	double *x = sx->pa + xi;
+	double *y = sy->pa + yi;
+	int ii;
+	double dot = 0.0;
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	__m128d
+		u_dot0, u_x0, u_y0, u_tmp;
+	__m256d
+		v_dot0, v_dot1, v_x0, v_x1, v_y0, v_y1, v_tmp;
+	
+	v_dot0 = _mm256_setzero_pd();
+	v_dot1 = _mm256_setzero_pd();
+	u_dot0 = _mm_setzero_pd();
+
+	ii = 0;
+	for(; ii<m-7; ii+=8)
+		{
+		v_x0 = _mm256_loadu_pd( &x[ii+0] );
+		v_x1 = _mm256_loadu_pd( &x[ii+4] );
+		v_y0 = _mm256_loadu_pd( &y[ii+0] );
+		v_y1 = _mm256_loadu_pd( &y[ii+4] );
+#if defined(TARGET_X64_INTEL_HASWELL)
+		v_dot0  = _mm256_fmadd_pd( v_x0, v_y0, v_dot0 );
+		v_dot1  = _mm256_fmadd_pd( v_x1, v_y1, v_dot1 );
+#else // sandy bridge
+		v_tmp = _mm256_mul_pd( v_x0, v_y0 );
+		v_dot0 = _mm256_add_pd( v_dot0, v_tmp );
+		v_tmp = _mm256_mul_pd( v_x1, v_y1 );
+		v_dot1 = _mm256_add_pd( v_dot1, v_tmp );
+#endif
+		}
+	for(; ii<m-3; ii+=4)
+		{
+		v_x0 = _mm256_loadu_pd( &x[ii+0] );
+		v_y0 = _mm256_loadu_pd( &y[ii+0] );
+#if defined(TARGET_X64_INTEL_HASWELL)
+		v_dot0  = _mm256_fmadd_pd( v_x0, v_y0, v_dot0 );
+#else // sandy bridge
+		v_tmp = _mm256_mul_pd( v_x0, v_y0 );
+		v_dot0 = _mm256_add_pd( v_dot0, v_tmp );
+#endif
+		}
+	for(; ii<m; ii++)
+		{
+		u_x0 = _mm_load_sd( &x[ii+0] );
+		u_y0 = _mm_load_sd( &y[ii+0] );
+#if defined(TARGET_X64_INTEL_HASWELL)
+		u_dot0  = _mm_fmadd_sd( u_x0, u_y0, u_dot0 );
+#else // sandy bridge
+		u_tmp = _mm_mul_sd( u_x0, u_y0 );
+		u_dot0 = _mm_add_sd( u_dot0, u_tmp );
+#endif
+		}
+	// reduce
+	v_dot0 = _mm256_add_pd( v_dot0, v_dot1 );
+	u_tmp = _mm_add_pd( _mm256_castpd256_pd128( v_dot0 ), _mm256_extractf128_pd( v_dot0, 0x1 ) );
+	u_tmp = _mm_hadd_pd( u_tmp, u_tmp);
+	u_dot0 = _mm_add_sd( u_dot0, u_tmp );
+	_mm_store_sd( &dot, u_dot0 );
+#else // no haswell, no sandy bridge
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		dot += x[ii+0] * y[ii+0];
+		dot += x[ii+1] * y[ii+1];
+		dot += x[ii+2] * y[ii+2];
+		dot += x[ii+3] * y[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		dot += x[ii+0] * y[ii+0];
+		}
+#endif // haswell, sandy bridge
+	return dot;
+	}
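+
+// Note on the reduction above: the two 256-bit accumulators are first summed,
+// the low and high 128-bit lanes are added via _mm256_extractf128_pd,
+// _mm_hadd_pd collapses the remaining pair, and the result is combined with the
+// scalar tail accumulator u_dot0 before being stored in dot.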
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/blas/d_blas2_diag_lib.c b/blas/d_blas2_diag_lib.c
new file mode 100644
index 0000000..8bc3f68
--- /dev/null
+++ b/blas/d_blas2_diag_lib.c
@@ -0,0 +1,45 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+#define REAL double
+
+#define STRVEC d_strvec
+
+#define GEMV_DIAG_LIBSTR dgemv_diag_libstr
+
+
+
+#include "x_blas2_diag_lib.c"
diff --git a/blas/d_blas2_lib.c b/blas/d_blas2_lib.c
new file mode 100644
index 0000000..9c39fe2
--- /dev/null
+++ b/blas/d_blas2_lib.c
@@ -0,0 +1,71 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#include "d_blas.h"
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_aux.h"
+
+
+
+#define REAL double
+
+#define STRMAT d_strmat
+#define STRVEC d_strvec
+
+#define GEMV_N_LIBSTR dgemv_n_libstr
+#define GEMV_NT_LIBSTR dgemv_nt_libstr
+#define GEMV_T_LIBSTR dgemv_t_libstr
+#define SYMV_L_LIBSTR dsymv_l_libstr
+#define TRMV_LNN_LIBSTR dtrmv_lnn_libstr
+#define TRMV_LTN_LIBSTR dtrmv_ltn_libstr
+#define TRMV_UNN_LIBSTR dtrmv_unn_libstr
+#define TRMV_UTN_LIBSTR dtrmv_utn_libstr
+#define TRSV_LNN_LIBSTR dtrsv_lnn_libstr
+#define TRSV_LNN_MN_LIBSTR dtrsv_lnn_mn_libstr
+#define TRSV_LNU_LIBSTR dtrsv_lnu_libstr
+#define TRSV_LTN_LIBSTR dtrsv_ltn_libstr
+#define TRSV_LTN_MN_LIBSTR dtrsv_ltn_mn_libstr
+#define TRSV_LTU_LIBSTR dtrsv_ltu_libstr
+#define TRSV_UNN_LIBSTR dtrsv_unn_libstr
+#define TRSV_UTN_LIBSTR dtrsv_utn_libstr
+
+#define COPY dcopy_
+#define GEMV dgemv_
+#define SYMV dsymv_
+#define TRMV dtrmv_
+#define TRSV dtrsv_
+
+
+
+#include "x_blas2_lib.c"
diff --git a/blas/d_blas2_lib4.c b/blas/d_blas2_lib4.c
new file mode 100644
index 0000000..cab8e3c
--- /dev/null
+++ b/blas/d_blas2_lib4.c
@@ -0,0 +1,1060 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_aux.h"
+
+
+
+void dtrsv_ln_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *x, double *y)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	// the routine assumes m>=n; clamp m up to n otherwise
+	if(m<n)
+		m = n;
+
+	const int bs = 4;
+
+	double alpha = -1.0;
+	double beta = 1.0;
+
+	int i;
+
+	if(x!=y)
+		{
+		for(i=0; i<m; i++)
+			y[i] = x[i];
+		}
+	
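+	// forward substitution on the n x n lower-triangular block; the diagonal entries are passed pre-inverted in inv_diag_A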
+	i = 0;
+	for( ; i<n-3; i+=4)
+		{
+		kernel_dtrsv_ln_inv_4_lib4(i, &pA[i*sda], &inv_diag_A[i], y, &y[i], &y[i]);
+		}
+	if(i<n)
+		{
+		kernel_dtrsv_ln_inv_4_vs_lib4(i, &pA[i*sda], &inv_diag_A[i], y, &y[i], &y[i], m-i, n-i);
+		i+=4;
+		}
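+	// rows below the triangular block: y[i:m] -= A[i:m,0:n] * y[0:n] (alpha=-1, beta=1)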
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+	for( ; i<m-7; i+=8)
+		{
+		kernel_dgemv_n_8_lib4(n, &alpha, &pA[i*sda], sda, y, &beta, &y[i], &y[i]);
+		}
+	if(i<m-3)
+		{
+		kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i]);
+		i+=4;
+		}
+#else
+	for( ; i<m-3; i+=4)
+		{
+		kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i]);
+		}
+#endif
+	if(i<m)
+		{
+		kernel_dgemv_n_4_gen_lib4(n, &alpha, &pA[i*sda], y, &beta, &y[i], &y[i], 0, m-i);
+		i+=4;
+		}
+
+	}
+
+
+
+void dtrsv_lt_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *x, double *y)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	if(n>m)
+		n = m;
+	
+	const int bs = 4;
+	
+	int i;
+	
+	if(x!=y)
+		for(i=0; i<m; i++)
+			y[i] = x[i];
+			
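+	// backward substitution: handle the 1-3 trailing rows first so the main loop works on full 4-row panels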
+	i=0;
+	if(n%4==1)
+		{
+		kernel_dtrsv_lt_inv_1_lib4(m-n+i+1, &pA[n/bs*bs*sda+(n-i-1)*bs], sda, &inv_diag_A[n-i-1], &y[n-i-1], &y[n-i-1], &y[n-i-1]);
+		i++;
+		}
+	else if(n%4==2)
+		{
+		kernel_dtrsv_lt_inv_2_lib4(m-n+i+2, &pA[n/bs*bs*sda+(n-i-2)*bs], sda, &inv_diag_A[n-i-2], &y[n-i-2], &y[n-i-2], &y[n-i-2]);
+		i+=2;
+		}
+	else if(n%4==3)
+		{
+		kernel_dtrsv_lt_inv_3_lib4(m-n+i+3, &pA[n/bs*bs*sda+(n-i-3)*bs], sda, &inv_diag_A[n-i-3], &y[n-i-3], &y[n-i-3], &y[n-i-3]);
+		i+=3;
+		}
+	for(; i<n-3; i+=4)
+		{
+		kernel_dtrsv_lt_inv_4_lib4(m-n+i+4, &pA[(n-i-4)/bs*bs*sda+(n-i-4)*bs], sda, &inv_diag_A[n-i-4], &y[n-i-4], &y[n-i-4], &y[n-i-4]);
+		}
+
+	}
+
+
+
+void dgemv_nt_lib(int m, int n, double alpha_n, double alpha_t, double *pA, int sda, double *x_n, double *x_t, double beta_n, double beta_t, double *y_n, double *y_t, double *z_n, double *z_t)
+	{
+
+	if(m<=0 | n<=0)
+		return;
+
+	const int bs = 4;
+
+	int ii;
+
+	// copy and scale y_n into z_n
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		z_n[ii+0] = beta_n*y_n[ii+0];
+		z_n[ii+1] = beta_n*y_n[ii+1];
+		z_n[ii+2] = beta_n*y_n[ii+2];
+		z_n[ii+3] = beta_n*y_n[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		z_n[ii+0] = beta_n*y_n[ii+0];
+		}
+	
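+	// fused kernels: one sweep over the panels of A computes both z_n += alpha_n*A*x_n and z_t = beta_t*y_t + alpha_t*A^T*x_t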
+	ii = 0;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	for(; ii<n-5; ii+=6)
+		{
+		kernel_dgemv_nt_6_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
+		}
+#endif
+	for(; ii<n-3; ii+=4)
+		{
+		kernel_dgemv_nt_4_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
+		}
+	if(ii<n)
+		{
+		kernel_dgemv_nt_4_vs_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii, n-ii);
+		}
+	
+	return;
+
+	}
+
+
+	
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+void dgemv_n_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
+	{
+
+	if(m<0)
+		return;
+
+	const int bs = 4;
+
+	int i;
+
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda;
+	double *x = sx->pa + xi;
+	double *y = sy->pa + yi;
+	double *z = sz->pa + zi;
+
+	i = 0;
+	// clean up at the beginning
+	if(ai%bs!=0)
+		{
+		kernel_dgemv_n_4_gen_lib4(n, &alpha, pA, x, &beta, y-ai%bs, z-ai%bs, ai%bs, m+ai%bs);
+		pA += bs*sda;
+		y += 4 - ai%bs;
+		z += 4 - ai%bs;
+		m -= 4 - ai%bs;
+		}
+	// main loop
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	for( ; i<m-11; i+=12)
+		{
+		kernel_dgemv_n_12_lib4(n, &alpha, &pA[i*sda], sda, x, &beta, &y[i], &z[i]);
+		}
+#endif
+	for( ; i<m-7; i+=8)
+		{
+		kernel_dgemv_n_8_lib4(n, &alpha, &pA[i*sda], sda, x, &beta, &y[i], &z[i]);
+		}
+	if(i<m-3)
+		{
+		kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
+		i+=4;
+		}
+#else
+	for( ; i<m-3; i+=4)
+		{
+		kernel_dgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
+		}
+#endif
+	if(i<m)
+		{
+		kernel_dgemv_n_4_vs_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i], m-i);
+		}
+		
+	return;
+
+	}
+
+
+
+void dgemv_t_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
+	{
+
+	if(n<=0)
+		return;
+	
+	const int bs = 4;
+
+	int i;
+
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	double *x = sx->pa + xi;
+	double *y = sy->pa + yi;
+	double *z = sz->pa + zi;
+
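+	// aligned row offset uses the wide 12/8/4 kernels; an unaligned ai falls back to the generic 4-wide kernel below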
+	if(ai%bs==0)
+		{
+		i = 0;
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+		for( ; i<n-11; i+=12)
+			{
+			kernel_dgemv_t_12_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+			}
+#endif
+		for( ; i<n-7; i+=8)
+			{
+			kernel_dgemv_t_8_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+			}
+		if(i<n-3)
+			{
+			kernel_dgemv_t_4_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+			i+=4;
+			}
+#else
+		for( ; i<n-3; i+=4)
+			{
+			kernel_dgemv_t_4_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+			}
+#endif
+		if(i<n)
+			{
+			kernel_dgemv_t_4_vs_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+			}
+		}
+	else // TODO kernel 8
+		{
+		i = 0;
+		for( ; i<n; i+=4)
+			{
+			kernel_dgemv_t_4_gen_lib4(m, &alpha, ai%bs, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+			}
+		}
+	
+	return;
+
+	}
+
+
+
+void dgemv_nt_libstr(int m, int n, double alpha_n, double alpha_t, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx_n, int xi_n, struct d_strvec *sx_t, int xi_t, double beta_n, double beta_t, struct d_strvec *sy_n, int yi_n, struct d_strvec *sy_t, int yi_t, struct d_strvec *sz_n, int zi_n, struct d_strvec *sz_t, int zi_t)
+	{
+	if(ai!=0)
+		{
+		printf("\ndgemv_nt_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs; // TODO ai
+	double *x_n = sx_n->pa + xi_n;
+	double *x_t = sx_t->pa + xi_t;
+	double *y_n = sy_n->pa + yi_n;
+	double *y_t = sy_t->pa + yi_t;
+	double *z_n = sz_n->pa + zi_n;
+	double *z_t = sz_t->pa + zi_t;
+	dgemv_nt_lib(m, n, alpha_n, alpha_t, pA, sda, x_n, x_t, beta_n, beta_t, y_n, y_t, z_n, z_t);
+	return;
+	}
+
+
+
+void dsymv_l_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi)
+	{
+
+	if(m<=0 | n<=0)
+		return;
+	
+	const int bs = 4;
+
+	int ii, n1;
+
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	double *x = sx->pa + xi;
+	double *y = sy->pa + yi;
+	double *z = sz->pa + zi;
+
+	// copy and scale y into z
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		z[ii+0] = beta*y[ii+0];
+		z[ii+1] = beta*y[ii+1];
+		z[ii+2] = beta*y[ii+2];
+		z[ii+3] = beta*y[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		z[ii+0] = beta*y[ii+0];
+		}
+	
+	// clean up at the beginning
+	if(ai%bs!=0) // 1, 2, 3
+		{
+		n1 = 4-ai%bs;
+		kernel_dsymv_l_4_gen_lib4(m, &alpha, ai%bs, &pA[0], sda, &x[0], &z[0], n<n1 ? n : n1);
+		pA += n1 + n1*bs + (sda-1)*bs;
+		x += n1;
+		z += n1;
+		m -= n1;
+		n -= n1;
+		}
+	// main loop
+	ii = 0;
+	for(; ii<n-3; ii+=4)
+		{
+		kernel_dsymv_l_4_lib4(m-ii, &alpha, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii]);
+		}
+	// clean up at the end
+	if(ii<n)
+		{
+		kernel_dsymv_l_4_gen_lib4(m-ii, &alpha, 0, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii], n-ii);
+		}
+	
+	return;
+	}
+
+
+
+// m >= n
+void dtrmv_lnn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+	{
+
+	if(m<=0)
+		return;
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+
+	if(m-n>0)
+		dgemv_n_libstr(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
+
+	double *pA2 = pA;
+	double *z2 = z;
+	int m2 = n;
+	int n2 = 0;
+	double *pA3, *x3;
+
+	double alpha = 1.0;
+	double beta = 1.0;
+
+	double zt[4];
+
+	int ii, jj, jj_end;
+
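+	// z = A*x with A lower triangular: walk the row panels bottom-up; each block is a 4x4 triangular tip plus a gemv over the columns to its left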
+	ii = 0;
+
+	if(ai%4!=0)
+		{
+		pA2 += sda*bs - ai%bs;
+		z2 += bs-ai%bs;
+		m2 -= bs-ai%bs;
+		n2 += bs-ai%bs;
+		}
+	
+	pA2 += m2/bs*bs*sda;
+	z2 += m2/bs*bs;
+	n2 += m2/bs*bs;
+
+	if(m2%bs!=0)
+		{
+		//
+		pA3 = pA2 + bs*n2;
+		x3 = x + n2;
+		zt[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+		zt[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+		zt[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+		zt[0] = pA3[0+bs*0]*x3[0];
+		kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, zt, zt);
+		for(jj=0; jj<m2%bs; jj++)
+			z2[jj] = zt[jj];
+		}
+	for(; ii<m2-3; ii+=4)
+		{
+		pA2 -= bs*sda;
+		z2 -= 4;
+		n2 -= 4;
+		pA3 = pA2 + bs*n2;
+		x3 = x + n2;
+		z2[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+		z2[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+		z2[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+		z2[0] = pA3[0+bs*0]*x3[0];
+		kernel_dgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, z2, z2);
+		}
+	if(ai%4!=0)
+		{
+		if(ai%bs==1)
+			{
+			zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+			zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+			zt[0] = pA[0+bs*0]*x[0];
+			jj_end = 4-ai%bs<n ? 4-ai%bs : n;
+			for(jj=0; jj<jj_end; jj++)
+				z[jj] = zt[jj];
+			}
+		else if(ai%bs==2)
+			{
+			zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+			zt[0] = pA[0+bs*0]*x[0];
+			jj_end = 4-ai%bs<n ? 4-ai%bs : n;
+			for(jj=0; jj<jj_end; jj++)
+				z[jj] = zt[jj];
+			}
+		else // if (ai%bs==3)
+			{
+			z[0] = pA[0+bs*0]*x[0];
+			}
+		}
+
+	return;
+
+	}
+
+
+
+// m >= n
+void dtrmv_ltn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+	{
+
+	if(m<=0)
+		return;
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+
+	double xt[4];
+	double zt[4];
+
+	double alpha = 1.0;
+	double beta = 1.0;
+
+	int ii, jj, ll, ll_max;
+
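+	// z = A^T*x with A lower triangular: for each 4-column block, multiply by the 4x4 triangular tip, then accumulate a transposed gemv over the rows below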
+	jj = 0;
+
+	if(ai%bs!=0)
+		{
+
+		if(ai%bs==1)
+			{
+			ll_max = m-jj<3 ? m-jj : 3;
+			for(ll=0; ll<ll_max; ll++)
+				xt[ll] = x[ll];
+			for(; ll<3; ll++)
+				xt[ll] = 0.0;
+			zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2];
+			zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2];
+			zt[2] = pA[2+bs*2]*xt[2];
+			pA += bs*sda - 1;
+			x += 3;
+			kernel_dgemv_t_4_lib4(m-3-jj, &alpha, pA, sda, x, &beta, zt, zt);
+			ll_max = n-jj<3 ? n-jj : 3;
+			for(ll=0; ll<ll_max; ll++)
+				z[ll] = zt[ll];
+			pA += bs*3;
+			z += 3;
+			jj += 3;
+			}
+		else if(ai%bs==2)
+			{
+			ll_max = m-jj<2 ? m-jj : 2;
+			for(ll=0; ll<ll_max; ll++)
+				xt[ll] = x[ll];
+			for(; ll<2; ll++)
+				xt[ll] = 0.0;
+			zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1];
+			zt[1] = pA[1+bs*1]*xt[1];
+			pA += bs*sda - 2;
+			x += 2;
+			kernel_dgemv_t_4_lib4(m-2-jj, &alpha, pA, sda, x, &beta, zt, zt);
+			ll_max = n-jj<2 ? n-jj : 2;
+			for(ll=0; ll<ll_max; ll++)
+				z[ll] = zt[ll];
+			pA += bs*2;
+			z += 2;
+			jj += 2;
+			}
+		else // if(ai%bs==3)
+			{
+			ll_max = m-jj<1 ? m-jj : 1;
+			for(ll=0; ll<ll_max; ll++)
+				xt[ll] = x[ll];
+			for(; ll<1; ll++)
+				xt[ll] = 0.0;
+			zt[0] = pA[0+bs*0]*xt[0];
+			pA += bs*sda - 3;
+			x += 1;
+			kernel_dgemv_t_4_lib4(m-1-jj, &alpha, pA, sda, x, &beta, zt, zt);
+			ll_max = n-jj<1 ? n-jj : 1;
+			for(ll=0; ll<ll_max; ll++)
+				z[ll] = zt[ll];
+			pA += bs*1;
+			z += 1;
+			jj += 1;
+			}
+
+		}
+	
+	for(; jj<n-3; jj+=4)
+		{
+		zt[0] = pA[0+bs*0]*x[0] + pA[1+bs*0]*x[1] + pA[2+bs*0]*x[2] + pA[3+bs*0]*x[3];
+		zt[1] = pA[1+bs*1]*x[1] + pA[2+bs*1]*x[2] + pA[3+bs*1]*x[3];
+		zt[2] = pA[2+bs*2]*x[2] + pA[3+bs*2]*x[3];
+		zt[3] = pA[3+bs*3]*x[3];
+		pA += bs*sda;
+		x += 4;
+		kernel_dgemv_t_4_lib4(m-4-jj, &alpha, pA, sda, x, &beta, zt, z);
+		pA += bs*4;
+		z += 4;
+		}
+	if(jj<n)
+		{
+		ll_max = m-jj<4 ? m-jj : 4;
+		for(ll=0; ll<ll_max; ll++)
+			xt[ll] = x[ll];
+		for(; ll<4; ll++)
+			xt[ll] = 0.0;
+		zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
+		zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
+		zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
+		zt[3] = pA[3+bs*3]*xt[3];
+		pA += bs*sda;
+		x += 4;
+		kernel_dgemv_t_4_lib4(m-4-jj, &alpha, pA, sda, x, &beta, zt, zt);
+		for(ll=0; ll<n-jj; ll++)
+			z[ll] = zt[ll];
+//		pA += bs*4;
+//		z += 4;
+		}
+
+	return;
+
+	}
+
+
+
+void dtrmv_unn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+	{
+
+	if(m<=0)
+		return;
+
+	if(ai!=0)
+		{
+		printf("\ndtrmv_unn_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs; // TODO ai
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+
+	int i;
+	
+	i=0;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	for(; i<m-7; i+=8)
+		{
+		kernel_dtrmv_un_8_lib4(m-i, pA, sda, x, z);
+		pA += 8*sda+8*bs;
+		x  += 8;
+		z  += 8;
+		}
+#endif
+	for(; i<m-3; i+=4)
+		{
+		kernel_dtrmv_un_4_lib4(m-i, pA, x, z);
+		pA += 4*sda+4*bs;
+		x  += 4;
+		z  += 4;
+		}
+	if(m>i)
+		{
+		if(m-i==1)
+			{
+			z[0] = pA[0+bs*0]*x[0];
+			}
+		else if(m-i==2)
+			{
+			z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1];
+			z[1] = pA[1+bs*1]*x[1];
+			}
+		else // if(m-i==3)
+			{
+			z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1] + pA[0+bs*2]*x[2];
+			z[1] = pA[1+bs*1]*x[1] + pA[1+bs*2]*x[2];
+			z[2] = pA[2+bs*2]*x[2];
+			}
+		}
+
+	return;
+
+	}
+
+
+
+void dtrmv_utn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+	{
+
+	if(m<=0)
+		return;
+
+	if(ai!=0)
+		{
+		printf("\ndtrmv_utn_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs; // TODO ai
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+
+	int ii, idx;
+	
+	double *ptrA;
+	
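+	// z = A^T*x with A upper triangular: start from the trailing m%bs rows, then walk the 4-row blocks backwards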
+	ii=0;
+	idx = m/bs*bs;
+	if(m%bs!=0)
+		{
+		kernel_dtrmv_ut_4_vs_lib4(m, pA+idx*bs, sda, x, z+idx, m%bs);
+		ii += m%bs;
+		}
+	idx -= 4;
+	for(; ii<m; ii+=4)
+		{
+		kernel_dtrmv_ut_4_lib4(idx+4, pA+idx*bs, sda, x, z+idx);
+		idx -= 4;
+		}
+
+	return;
+
+	}
+
+
+
+void dtrsv_lnn_mn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+	{
+	if(m==0 | n==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** dtrsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** dtrsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** dtrsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** dtrsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** dtrsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** dtrsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** dtrsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** dtrsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** dtrsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** dtrsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	if(ai!=0 | xi%4!=0)
+		{
+		printf("\ndtrsv_lnn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs; // TODO ai
+	double *dA = sA->dA;
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+	int ii;
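+	// extract and invert the diagonal of A; cache it in sA->dA when the submatrix starts at (0,0)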
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+			for(ii=0; ii<n; ii++)
+				dA[ii] = 1.0 / dA[ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+		for(ii=0; ii<n; ii++)
+			dA[ii] = 1.0 / dA[ii];
+		sA->use_dA = 0;
+		}
+	dtrsv_ln_inv_lib(m, n, pA, sda, dA, x, z);
+	return;
+	}
+
+
+
+void dtrsv_ltn_mn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** dtrsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** dtrsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** dtrsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** dtrsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** dtrsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** dtrsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** dtrsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** dtrsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** dtrsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** dtrsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	if(ai!=0 | xi%4!=0)
+		{
+		printf("\ndtrsv_ltn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs; // TODO ai
+	double *dA = sA->dA;
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+	int ii;
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+			for(ii=0; ii<n; ii++)
+				dA[ii] = 1.0 / dA[ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+		for(ii=0; ii<n; ii++)
+			dA[ii] = 1.0 / dA[ii];
+		sA->use_dA = 0;
+		}
+	dtrsv_lt_inv_lib(m, n, pA, sda, dA, x, z);
+	return;
+	}
+
+
+
+void dtrsv_lnn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** dtrsv_lnn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** dtrsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** dtrsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** dtrsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** dtrsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** dtrsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** dtrsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** dtrsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** dtrsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	if(ai!=0 | xi%4!=0)
+		{
+		printf("\ndtrsv_lnn_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs; // TODO ai
+	double *dA = sA->dA;
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+	int ii;
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			ddiaex_lib(m, 1.0, ai, pA, sda, dA);
+			for(ii=0; ii<m; ii++)
+				dA[ii] = 1.0 / dA[ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		ddiaex_lib(m, 1.0, ai, pA, sda, dA);
+		for(ii=0; ii<m; ii++)
+			dA[ii] = 1.0 / dA[ii];
+		sA->use_dA = 0;
+		}
+	dtrsv_ln_inv_lib(m, m, pA, sda, dA, x, z);
+	return;
+	}
+
+
+
+void dtrsv_lnu_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** dtrsv_lnu_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** dtrsv_lnu_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** dtrsv_lnu_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** dtrsv_lnu_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** dtrsv_lnu_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** dtrsv_lnu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** dtrsv_lnu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** dtrsv_lnu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** dtrsv_lnu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	printf("\n***** dtrsv_lnu_libstr : feature not implemented yet *****\n");
+	exit(1);
+	}
+
+
+
+void dtrsv_ltn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** dtrsv_ltn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** dtrsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** dtrsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** dtrsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** dtrsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** dtrsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** dtrsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** dtrsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** dtrsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	if(ai!=0 | xi%4!=0)
+		{
+		printf("\ndtrsv_ltn_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+	const int bs = 4;
+	int sda = sA->cn;
+	double *pA = sA->pA + aj*bs; // TODO ai
+	double *dA = sA->dA;
+	double *x = sx->pa + xi;
+	double *z = sz->pa + zi;
+	int ii;
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			ddiaex_lib(m, 1.0, ai, pA, sda, dA);
+			for(ii=0; ii<m; ii++)
+				dA[ii] = 1.0 / dA[ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		ddiaex_lib(m, 1.0, ai, pA, sda, dA);
+		for(ii=0; ii<m; ii++)
+			dA[ii] = 1.0 / dA[ii];
+		sA->use_dA = 0;
+		}
+	dtrsv_lt_inv_lib(m, m, pA, sda, dA, x, z);
+	return;
+	}
+
+
+
+void dtrsv_ltu_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** dtrsv_ltu_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** dtrsv_ltu_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** dtrsv_ltu_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** dtrsv_ltu_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** dtrsv_ltu_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** dtrsv_ltu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** dtrsv_ltu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** dtrsv_ltu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** dtrsv_ltu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	printf("\n***** dtrsv_ltu_libstr : feature not implemented yet *****\n");
+	exit(1);
+	}
+
+
+
+void dtrsv_unn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** dtrsv_unn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** dtrsv_unn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** dtrsv_unn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** dtrsv_unn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** dtrsv_unn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** dtrsv_unn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** dtrsv_unn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** dtrsv_unn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** dtrsv_unn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	printf("\n***** dtrsv_unn_libstr : feature not implemented yet *****\n");
+	exit(1);
+	}
+
+
+
+void dtrsv_utn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** dtrsv_utn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** dtrsv_utn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** dtrsv_utn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** dtrsv_utn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** dtrsv_utn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** dtrsv_utn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** dtrsv_utn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** dtrsv_utn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** dtrsv_utn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	printf("\n***** dtrsv_utn_libstr : feature not implemented yet *****\n");
+	exit(1);
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/blas/d_blas3_diag_lib.c b/blas/d_blas3_diag_lib.c
new file mode 100644
index 0000000..ff69317
--- /dev/null
+++ b/blas/d_blas3_diag_lib.c
@@ -0,0 +1,47 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+#define REAL double
+
+#define STRMAT d_strmat
+#define STRVEC d_strvec
+
+#define GEMM_R_DIAG_LIBSTR dgemm_r_diag_libstr
+#define GEMM_L_DIAG_LIBSTR dgemm_l_diag_libstr
+
+
+
+#include "x_blas3_diag_lib.c"
diff --git a/blas/d_blas3_diag_lib4.c b/blas/d_blas3_diag_lib4.c
new file mode 100644
index 0000000..2731d1f
--- /dev/null
+++ b/blas/d_blas3_diag_lib4.c
@@ -0,0 +1,184 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+/****************************
+* old interface
+****************************/
+
+void dgemm_diag_left_lib(int m, int n, double alpha, double *dA, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int ii;
+
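+	// beta==0 uses the _a0 kernels, which skip reading C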
+	ii = 0;
+	if(beta==0.0)
+		{
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_dgemm_diag_left_4_a0_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &pD[ii*sdd]);
+			}
+		}
+	else
+		{
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_dgemm_diag_left_4_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+			}
+		}
+	if(m-ii>0)
+		{
+		if(m-ii==1)
+			kernel_dgemm_diag_left_1_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+		else if(m-ii==2)
+			kernel_dgemm_diag_left_2_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+		else // if(m-ii==3)
+			kernel_dgemm_diag_left_3_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+		}
+	
+	}
+
+
+
+void dgemm_diag_right_lib(int m, int n, double alpha, double *pA, int sda, double *dB, double beta, double *pC, int sdc, double *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	int ii;
+
+	ii = 0;
+	if(beta==0.0)
+		{
+		for( ; ii<n-3; ii+=4)
+			{
+			kernel_dgemm_diag_right_4_a0_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &pD[ii*bs], sdd);
+			}
+		}
+	else
+		{
+		for( ; ii<n-3; ii+=4)
+			{
+			kernel_dgemm_diag_right_4_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+			}
+		}
+	if(n-ii>0)
+		{
+		if(n-ii==1)
+			kernel_dgemm_diag_right_1_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+		else if(n-ii==2)
+			kernel_dgemm_diag_right_2_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+		else // if(n-ii==3)
+			kernel_dgemm_diag_right_3_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+		}
+	
+	}
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// dgemm with A diagonal matrix (stored as strvec)
+void dgemm_l_diag_libstr(int m, int n, double alpha, struct d_strvec *sA, int ai, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	if(bi!=0 | ci!=0 | di!=0)
+		{
+		printf("\ndgemm_l_diag_libstr: feature not implemented yet: bi=%d, ci=%d, di=%d\n", bi, ci, di);
+		exit(1);
+		}
+	const int bs = 4;
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	double *dA = sA->pa + ai;
+	double *pB = sB->pA + bj*bs;
+	double *pC = sC->pA + cj*bs;
+	double *pD = sD->pA + dj*bs;
+	dgemm_diag_left_lib(m, n, alpha, dA, pB, sdb, beta, pC, sdc, pD, sdd);
+	return;
+	}
+
+
+
+// dgemm with B diagonal matrix (stored as strvec)
+void dgemm_r_diag_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sB, int bi, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	if(ai!=0 | ci!=0 | di!=0)
+		{
+		printf("\ndgemm_r_diag_libstr: feature not implemented yet: ai=%d, ci=%d, di=%d\n", ai, ci, di);
+		exit(1);
+		}
+	const int bs = 4;
+	int sda = sA->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	double *pA = sA->pA + aj*bs;
+	double *dB = sB->pa + bi;
+	double *pC = sC->pA + cj*bs;
+	double *pD = sD->pA + dj*bs;
+	dgemm_diag_right_lib(m, n, alpha, pA, sda, dB, beta, pC, sdc, pD, sdd);
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
diff --git a/blas/d_blas3_lib.c b/blas/d_blas3_lib.c
new file mode 100644
index 0000000..27c20ab
--- /dev/null
+++ b/blas/d_blas3_lib.c
@@ -0,0 +1,69 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#if defined(REF_BLAS_BLIS)
+#include "d_blas_64.h"
+#else
+#include "d_blas.h"
+#endif
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_aux.h"
+
+
+
+#define REAL double
+
+#define STRMAT d_strmat
+
+#define GEMM_NN_LIBSTR dgemm_nn_libstr
+#define GEMM_NT_LIBSTR dgemm_nt_libstr
+#define SYRK_LN_LIBSTR dsyrk_ln_libstr
+#define SYRK_LN_MN_LIBSTR dsyrk_ln_mn_libstr
+#define TRMM_RLNN_LIBSTR dtrmm_rlnn_libstr
+#define TRMM_RUTN_LIBSTR dtrmm_rutn_libstr
+#define TRSM_LLNU_LIBSTR dtrsm_llnu_libstr
+#define TRSM_LUNN_LIBSTR dtrsm_lunn_libstr
+#define TRSM_RLTN_LIBSTR dtrsm_rltn_libstr
+#define TRSM_RLTU_LIBSTR dtrsm_rltu_libstr
+#define TRSM_RUTN_LIBSTR dtrsm_rutn_libstr
+
+#define COPY dcopy_
+#define GEMM dgemm_
+#define SYRK dsyrk_
+#define TRMM dtrmm_
+#define TRSM dtrsm_
+
+
+
+#include "x_blas3_lib.c"
diff --git a/blas/d_blas3_lib4.c b/blas/d_blas3_lib4.c
new file mode 100644
index 0000000..dfa3cb8
--- /dev/null
+++ b/blas/d_blas3_lib4.c
@@ -0,0 +1,2728 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_aux.h"
+
+
+
+/****************************
+* old interface
+****************************/
+
+void dgemm_nt_lib(int m, int n, int k, double alpha, double *pA, int sda, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	const int ps = 4;
+
+	int i, j, l;
+
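+	// target-specific main loops over 12-, 8- or 4-row panels; leftover rows are handled by the labelled clean-up code (left_12/left_8/left_4) below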
+	i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-11; i+=12)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+			}
+		if(j<n)
+			{
+			kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else
+			{
+			goto left_12;
+			}
+		}
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	for(; i<m-7; i+=8)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+			}
+		if(j<n)
+			{
+			kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else
+			{
+			goto left_8;
+			}
+		}
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+	for(; i<m-7; i+=8)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+			}
+		if(j<n)
+			{
+			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*ps+(i+4)*sdc], &pD[j*ps+(i+4)*sdd], m-(i+4), n-j);
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else
+			{
+			goto left_8;
+			}
+		}
+#else
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+			}
+		if(j<n)
+			{
+			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+#endif
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_12:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_8:
+	j = 0;
+	for(; j<n-8; j+=12)
+		{
+		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, n-(j+4));
+		}
+	
+	if(j<n-4)
+		{
+		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, n-(j+4));
+		}
+	else if(j<n)
+		{
+		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		}
+	return;
+#endif
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	left_8:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		}
+	return;
+#endif
+#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+	left_8:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*ps+(i+4)*sdc], &pD[j*ps+(i+4)*sdd], m-(i+4), n-j);
+		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_4:
+	j = 0;
+	for(; j<n-8; j+=12)
+		{
+		kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+		}
+	if(j<n-4)
+		{
+		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+		}
+	else if(j<n)
+		{
+		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+		}
+	return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	left_4:
+	j = 0;
+	for(; j<n-4; j+=8)
+		{
+		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+		}
+	if(j<n)
+		{
+		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+		}
+	return;
+#else
+	left_4:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+		}
+	return;
+#endif
+
+	}
+
+
+
+#if 0
+void dgemm_nn_lib(int m, int n, int k, double alpha, double *pA, int sda, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	const int ps = 4;
+
+	int i, j, l;
+
+	i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-11; i+=12)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_dgemm_nn_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+			}
+		if(j<n)
+			{
+			kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else
+			{
+			goto left_12;
+			}
+		}
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	for(; i<m-7; i+=8)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_dgemm_nn_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+			}
+		if(j<n)
+			{
+			kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else
+			{
+			goto left_8;
+			}
+		}
+#else
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_dgemm_nn_4x4_lib4(k, &alpha, &pA[i*sda], 0, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+			}
+		if(j<n)
+			{
+			kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+#endif
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_12:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+	left_8:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		}
+	return;
+#endif
+
+	left_4:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+		}
+	return;
+
+	}
+#endif
+
+
+
+void dtrmm_nt_ru_lib(int m, int n, double alpha, double *pA, int sda, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	const int ps = 4;
+	
+	int i, j;
+	
+	i = 0;
+// XXX there is a bug here !!!!!!
+#if 0//defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-11; i+=12)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_dtrmm_nt_ru_12x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+			}
+		if(j<n) // TODO specialized edge routine
+			{
+			kernel_dtrmm_nt_ru_12x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+			}
+		}
+	if(i<m)
+		{
+		if(m-i<5)
+			{
+			goto left_4;
+			}
+		if(m-i<9)
+			{
+			goto left_8;
+			}
+		else
+			{
+			goto left_12;
+			}
+		}
+
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-7; i+=8)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_dtrmm_nt_ru_8x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+			}
+		if(j<n) // TODO specialized edge routine
+			{
+			kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+			}
+		}
+	if(i<m)
+		{
+		if(m-i<5)
+			{
+			goto left_4;
+			}
+		else
+			{
+			goto left_8;
+			}
+		}
+
+#else
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_dtrmm_nt_ru_4x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+			}
+		if(j<n) // TODO specialized edge routine
+			{
+			kernel_dtrmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+			}
+		}
+	if(i<m)
+		{
+		goto left_4;
+		}
+#endif
+	
+	// common return
+	return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	// clean up
+	left_12:
+	j = 0;
+//	for(; j<n-3; j+=4)
+	for(; j<n; j+=4)
+		{
+		kernel_dtrmm_nt_ru_12x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		}
+//	if(j<n) // TODO specialized edge routine
+//		{
+//		kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+//		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+	// clean up
+	left_8:
+	j = 0;
+//	for(; j<n-3; j+=4)
+	for(; j<n; j+=4)
+		{
+		kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		}
+//	if(j<n) // TODO specialized edge routine
+//		{
+//		kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+//		}
+	return;
+#endif
+
+	left_4:
+	j = 0;
+//	for(; j<n-3; j+=4)
+	for(; j<n; j+=4)
+		{
+		kernel_dtrmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+		}
+//	if(j<n) // TODO specialized edge routine
+//		{
+//		kernel_dtrmm_nt_ru_4x4_vs_lib4(n-j, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+//		}
+	return;
+
+	}
+
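+// The following is a hedged, column-major scalar reference of what
+// dtrmm_nt_ru_lib above computes, D = beta*C + alpha * A * B^T with B upper
+// triangular. It is kept here only as documentation: the _ref name, the
+// lda/ldb/ldc/ldd leading dimensions and the plain column-major layout are
+// illustrative and are not the panel-major ("lib4") layout used by the kernels.
+#if 0
+static void dtrmm_nt_ru_ref(int m, int n, double alpha, double *A, int lda, double *B, int ldb, double beta, double *C, int ldc, double *D, int ldd)
+	{
+	int ii, jj, ll;
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			double tmp = 0.0;
+			for(ll=jj; ll<n; ll++) // B upper triangular: B[jj,ll]==0 for ll<jj
+				tmp += A[ii+ll*lda] * B[jj+ll*ldb];
+			D[ii+jj*ldd] = beta*C[ii+jj*ldc] + alpha*tmp;
+			}
+		}
+	}
+#endif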
+
+
+// D <= B * A^{-T}, with A lower triangular with unit diagonal
+void dtrsm_nt_rl_one_lib(int m, int n, double *pA, int sda, double *pB, int sdb, double *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	const int ps = 4;
+	
+	int i, j;
+	
+	i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-11; i+=12)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_dtrsm_nt_rl_one_12x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda]);
+			}
+		if(j<n)
+			{
+			kernel_dtrsm_nt_rl_one_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else
+			{
+			goto left_12;
+			}
+		}
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	for(; i<m-7; i+=8)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_dtrsm_nt_rl_one_8x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda]);
+			}
+		if(j<n)
+			{
+			kernel_dtrsm_nt_rl_one_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else
+			{
+			goto left_8;
+			}
+		}
+#else
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_dtrsm_nt_rl_one_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda]);
+			}
+		if(j<n)
+			{
+			kernel_dtrsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], m-i, n-j);
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+#endif
+
+	// common return if i==m
+	return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_12:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_dtrsm_nt_rl_one_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
+		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+	left_8:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_dtrsm_nt_rl_one_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
+		}
+	return;
+#endif
+
+	left_4:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_dtrsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], m-i, n-j);
+		}
+	return;
+
+	}
+
+
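+// Hedged scalar reference of dtrsm_nt_rl_one_lib above: it solves D * A^T = B
+// for D, with A lower triangular and unit diagonal, so no division is needed.
+// Column-major sketch with illustrative names; the real routine works on
+// panel-major data.
+#if 0
+static void dtrsm_nt_rl_one_ref(int m, int n, double *A, int lda, double *B, int ldb, double *D, int ldd)
+	{
+	int ii, jj, ll;
+	for(jj=0; jj<n; jj++) // forward substitution over the columns of D
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			double tmp = B[ii+jj*ldb];
+			for(ll=0; ll<jj; ll++)
+				tmp -= D[ii+ll*ldd] * A[jj+ll*lda];
+			D[ii+jj*ldd] = tmp; // unit diagonal: A[jj,jj]==1
+			}
+		}
+	}
+#endif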
+
+// D <= B * A^{-T}, with A upper triangular, using the explicitly inverted diagonal inv_diag_A
+void dtrsm_nt_ru_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *pB, int sdb, double *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	const int ps = 4;
+	
+	int i, j, idx;
+
+	int rn = n%4;
+
+	double *dummy;
+	
+	i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-11; i+=12)
+		{
+		j = 0;
+		// clean at the end
+		if(rn>0)
+			{
+			idx = n-rn;
+			kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(0, dummy, 0, dummy, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+			j += rn;
+			}
+		for(; j<n; j+=4)
+			{
+			idx = n-j-4;
+			kernel_dtrsm_nt_ru_inv_12x4_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx]);
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else
+			{
+			goto left_12;
+			}
+		}
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	for(; i<m-7; i+=8)
+		{
+		j = 0;
+		// clean at the end
+		if(rn>0)
+			{
+			idx = n-rn;
+			kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(0, dummy, 0, dummy, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+			j += rn;
+			}
+		for(; j<n; j+=4)
+			{
+			idx = n-j-4;
+			kernel_dtrsm_nt_ru_inv_8x4_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx]);
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else
+			{
+			goto left_8;
+			}
+		}
+#else
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		// clean at the end
+		if(rn>0)
+			{
+			idx = n-rn;
+			kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+			j += rn;
+			}
+		for(; j<n; j+=4)
+			{
+			idx = n-j-4;
+			kernel_dtrsm_nt_ru_inv_4x4_lib4(j, &pD[i*sdd+(idx+4)*ps], &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &inv_diag_A[idx]);
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+#endif
+
+	// common return if i==m
+	return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_12:
+	j = 0;
+	// TODO
+	// clean at the end
+	if(rn>0)
+		{
+		idx = n-rn;
+		kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(0, dummy, 0, dummy, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+		j += rn;
+		}
+	for(; j<n; j+=4)
+		{
+		idx = n-j-4;
+		kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, 4);
+		}
+	return;
+
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+	left_8:
+	j = 0;
+	// TODO
+	// clean at the end
+	if(rn>0)
+		{
+		idx = n-rn;
+		kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(0, dummy, 0, dummy, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+		j += rn;
+		}
+	for(; j<n; j+=4)
+		{
+		idx = n-j-4;
+		kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, 4);
+		}
+	return;
+
+#endif
+
+	left_4:
+	j = 0;
+	// TODO
+	// clean at the end
+	if(rn>0)
+		{
+		idx = n-rn;
+		kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, rn);
+		j += rn;
+		}
+	for(; j<n; j+=4)
+		{
+		idx = n-j-4;
+		kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], &pA[idx*sda+(idx+4)*ps], &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &inv_diag_A[idx], m-i, 4);
+		}
+	return;
+
+	}
+
+
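+// Hedged scalar reference of dtrsm_nt_ru_inv_lib above: it solves D * A^T = B
+// with A upper triangular, proceeding backwards over the columns and
+// multiplying by the precomputed reciprocals inv_diag_A[jj] = 1/A[jj,jj]
+// instead of dividing. The `dummy` pointer in the calls above is only passed
+// together with a zero k argument, so it is presumably never dereferenced.
+// Column-major sketch with illustrative names, not the panel-major layout.
+#if 0
+static void dtrsm_nt_ru_inv_ref(int m, int n, double *A, int lda, double *inv_diag_A, double *B, int ldb, double *D, int ldd)
+	{
+	int ii, jj, ll;
+	for(jj=n-1; jj>=0; jj--) // backward substitution over the columns of D
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			double tmp = B[ii+jj*ldb];
+			for(ll=jj+1; ll<n; ll++)
+				tmp -= D[ii+ll*ldd] * A[jj+ll*lda];
+			D[ii+jj*ldd] = tmp * inv_diag_A[jj];
+			}
+		}
+	}
+#endif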
+
+// D <= A^{-1} * B, with A lower triangular with unit diagonal
+void dtrsm_nn_ll_one_lib(int m, int n, double *pA, int sda, double *pB, int sdb, double *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	const int ps = 4;
+	
+	int i, j;
+	
+	i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for( ; i<m-11; i+=12)
+		{
+		j = 0;
+		for( ; j<n-3; j+=4)
+			{
+			kernel_dtrsm_nn_ll_one_12x4_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda);
+			}
+		if(j<n)
+			{
+			kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
+			}
+		}
+	if(i<m)
+		{
+		if(m-i<=4)
+			goto left_4;
+		if(m-i<=8)
+			goto left_8;
+		else
+			goto left_12;
+		}
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	for( ; i<m-7; i+=8)
+		{
+		j = 0;
+		for( ; j<n-3; j+=4)
+			{
+			kernel_dtrsm_nn_ll_one_8x4_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda);
+			}
+		if(j<n)
+			{
+			kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
+			}
+		}
+	if(i<m)
+		{
+		if(m-i<=4)
+			goto left_4;
+		else
+			goto left_8;
+		}
+#else
+	for( ; i<m-3; i+=4)
+		{
+		j = 0;
+		for( ; j<n-3; j+=4)
+			{
+			kernel_dtrsm_nn_ll_one_4x4_lib4(i, pA+i*sda, pD+j*ps, sdd, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps);
+			}
+		if(j<n)
+			{
+			kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*ps, sdd, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, m-i, n-j);
+			}
+		}
+	if(i<m)
+		{
+		goto left_4;
+		}
+#endif
+
+	// common return
+	return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_12:
+	j = 0;
+	for( ; j<n; j+=4)
+		{
+		kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
+		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+	left_8:
+	j = 0;
+	for( ; j<n; j+=4)
+		{
+		kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
+		}
+	return;
+#endif
+
+	left_4:
+	j = 0;
+	for( ; j<n; j+=4)
+		{
+		kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*ps, sdd, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, m-i, n-j);
+		}
+	return;
+
+	}
+
+
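+// Hedged scalar reference of dtrsm_nn_ll_one_lib above: it solves A * D = B
+// with A lower triangular and unit diagonal (forward substitution over the
+// rows of D, no division). Column-major sketch, illustrative names only.
+#if 0
+static void dtrsm_nn_ll_one_ref(int m, int n, double *A, int lda, double *B, int ldb, double *D, int ldd)
+	{
+	int ii, jj, ll;
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=0; ii<m; ii++) // forward substitution: rows ll<ii already solved
+			{
+			double tmp = B[ii+jj*ldb];
+			for(ll=0; ll<ii; ll++)
+				tmp -= A[ii+ll*lda] * D[ll+jj*ldd];
+			D[ii+jj*ldd] = tmp; // unit diagonal: A[ii,ii]==1
+			}
+		}
+	}
+#endif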
+
+// D <= A^{-1} * B, with A upper triangular, using the explicitly inverted diagonal inv_diag_A
+void dtrsm_nn_lu_inv_lib(int m, int n, double *pA, int sda, double *inv_diag_A, double *pB, int sdb, double *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	const int ps = 4;
+	
+	int i, j, idx;
+	double *dummy;
+	
+	i = 0;
+	int rm = m%4;
+	if(rm>0)
+		{
+		// TODO code the final case explicitly
+		idx = m-rm; // position of the part to do
+		j = 0;
+		for( ; j<n; j+=4)
+			{
+			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(0, dummy, dummy, 0, pB+idx*sdb+j*ps, pD+idx*sdd+j*ps, pA+idx*sda+idx*ps, inv_diag_A+idx, rm, n-j);
+			}
+		// TODO
+		i += rm;
+		}
+//	int em = m-rm;
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for( ; i<m-8; i+=12)
+		{
+		idx = m-i; // position of already done part
+		j = 0;
+		for( ; j<n-3; j+=4)
+			{
+			kernel_dtrsm_nn_lu_inv_12x4_lib4(i, pA+(idx-12)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, sdb, pD+(idx-12)*sdd+j*ps, sdd, pA+(idx-12)*sda+(idx-12)*ps, sda, inv_diag_A+(idx-12));
+			}
+		if(j<n)
+			{
+			kernel_dtrsm_nn_lu_inv_12x4_vs_lib4(i, pA+(idx-12)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, sdb, pD+(idx-12)*sdd+j*ps, sdd, pA+(idx-12)*sda+(idx-12)*ps, sda, inv_diag_A+(idx-12), 12, n-j);
+//			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, inv_diag_A+(idx-4), 4, n-j);
+//			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+4, pA+(idx-8)*sda+(idx-4)*ps, pD+(idx-4)*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, pD+(idx-8)*sdd+j*ps, pA+(idx-8)*sda+(idx-8)*ps, inv_diag_A+(idx-8), 4, n-j);
+//			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+8, pA+(idx-12)*sda+(idx-8)*ps, pD+(idx-8)*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, pD+(idx-12)*sdd+j*ps, pA+(idx-12)*sda+(idx-12)*ps, inv_diag_A+(idx-12), 4, n-j);
+			}
+		}
+#endif
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+	for( ; i<m-4; i+=8)
+		{
+		idx = m-i; // position of already done part
+		j = 0;
+		for( ; j<n-3; j+=4)
+			{
+			kernel_dtrsm_nn_lu_inv_8x4_lib4(i, pA+(idx-8)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, sdb, pD+(idx-8)*sdd+j*ps, sdd, pA+(idx-8)*sda+(idx-8)*ps, sda, inv_diag_A+(idx-8));
+			}
+		if(j<n)
+			{
+			kernel_dtrsm_nn_lu_inv_8x4_vs_lib4(i, pA+(idx-8)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, sdb, pD+(idx-8)*sdd+j*ps, sdd, pA+(idx-8)*sda+(idx-8)*ps, sda, inv_diag_A+(idx-8), 8, n-j);
+//			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, inv_diag_A+(idx-4), 4, n-j);
+//			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+4, pA+(idx-8)*sda+(idx-4)*ps, pD+(idx-4)*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, pD+(idx-8)*sdd+j*ps, pA+(idx-8)*sda+(idx-8)*ps, inv_diag_A+(idx-8), 4, n-j);
+			}
+		}
+#endif
+	for( ; i<m; i+=4)
+		{
+		idx = m-i; // position of already done part
+		j = 0;
+		for( ; j<n-3; j+=4)
+			{
+			kernel_dtrsm_nn_lu_inv_4x4_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, inv_diag_A+(idx-4));
+			}
+		if(j<n)
+			{
+			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, inv_diag_A+(idx-4), 4, n-j);
+			}
+		}
+
+	// common return
+	return;
+
+	}
+
+
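+// Hedged scalar reference of dtrsm_nn_lu_inv_lib above: it solves A * D = B
+// with A upper triangular, proceeding backwards over the rows of D and
+// multiplying by inv_diag_A[ii] = 1/A[ii,ii] (matching the blocks above, which
+// start from the bottom of the matrix). Column-major, illustrative names only.
+#if 0
+static void dtrsm_nn_lu_inv_ref(int m, int n, double *A, int lda, double *inv_diag_A, double *B, int ldb, double *D, int ldd)
+	{
+	int ii, jj, ll;
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=m-1; ii>=0; ii--) // backward substitution: rows ll>ii already solved
+			{
+			double tmp = B[ii+jj*ldb];
+			for(ll=ii+1; ll<m; ll++)
+				tmp -= A[ii+ll*lda] * D[ll+jj*ldd];
+			D[ii+jj*ldd] = tmp * inv_diag_A[ii];
+			}
+		}
+	}
+#endif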
+
+#if 0
+void dlauum_blk_nt_l_lib(int m, int n, int nv, int *rv, int *cv, double *pA, int sda, double *pB, int sdb, int alg, double *pC, int sdc, double *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	// TODO remove
+	double alpha, beta;
+	if(alg==0)
+		{
+		alpha = 1.0;
+		beta = 0.0;
+		}
+	else if(alg==1)
+		{
+		alpha = 1.0;
+		beta = 1.0;
+		}
+	else
+		{
+		alpha = -1.0;
+		beta = 1.0;
+		}
+
+	// TODO remove
+	int k = cv[nv-1];
+
+	const int ps = 4;
+
+	int i, j, l;
+	int ii, iii, jj, kii, kiii, kjj, k0, k1;
+
+	i = 0;
+	ii = 0;
+	iii = 0;
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-7; i+=8)
+		{
+
+		while(ii<nv && rv[ii]<i+8)
+			ii++;
+		if(ii<nv)
+			kii = cv[ii];
+		else
+			kii = cv[ii-1];
+
+		j = 0;
+		jj = 0;
+		for(; j<i && j<n-3; j+=4)
+			{
+
+			while(jj<nv && rv[jj]<j+4)
+				jj++;
+			if(jj<nv)
+				kjj = cv[jj];
+			else
+				kjj = cv[jj-1];
+			k0 = kii<kjj ? kii : kjj;
+
+			kernel_dgemm_nt_8x4_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+			}
+		if(j<n)
+			{
+
+			while(jj<nv && rv[jj]<j+4)
+				jj++;
+			if(jj<nv)
+				kjj = cv[jj];
+			else
+				kjj = cv[jj-1];
+			k0 = kii<kjj ? kii : kjj;
+
+			if(j<i) // dgemm
+				{
+				kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, 8, n-j);
+				}
+			else // dsyrk
+				{
+				kernel_dsyrk_nt_l_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, 8, n-j);
+				if(j<n-4)
+					{
+					kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], 4, n-j-4); // TODO
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else
+			{
+			goto left_8;
+			}
+		}
+#else
+	for(; i<m-3; i+=4)
+		{
+
+		while(ii<nv && rv[ii]<i+4)
+			ii++;
+		if(ii<nv)
+			kii = cv[ii];
+		else
+			kii = cv[ii-1];
+//		k0 = kii;
+//		printf("\nii %d %d %d %d %d\n", i, ii, rv[ii], cv[ii], kii);
+
+		j = 0;
+		jj = 0;
+		for(; j<i && j<n-3; j+=4)
+			{
+
+			while(jj<nv && rv[jj]<j+4)
+				jj++;
+			if(jj<nv)
+				kjj = cv[jj];
+			else
+				kjj = cv[jj-1];
+			k0 = kii<kjj ? kii : kjj;
+//			printf("\njj %d %d %d %d %d\n", j, jj, rv[jj], cv[jj], kjj);
+
+			kernel_dgemm_nt_4x4_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+			}
+		if(j<n)
+			{
+
+			while(jj<nv && rv[jj]<j+4)
+				jj++;
+			if(jj<nv)
+				kjj = cv[jj];
+			else
+				kjj = cv[jj-1];
+			k0 = kii<kjj ? kii : kjj;
+//			printf("\njj %d %d %d %d %d\n", j, jj, rv[jj], cv[jj], kjj);
+
+			if(i<j) // dgemm
+				{
+				kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], 4, n-j);
+				}
+			else // dsyrk
+				{
+				kernel_dsyrk_nt_l_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], 4, n-j);
+				}
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+#endif
+
+	// common return if i==m
+	return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+	left_8:
+
+	kii = cv[nv-1];
+
+	j = 0;
+	jj = 0;
+	for(; j<i && j<n-3; j+=4)
+		{
+
+		while(jj<nv && rv[jj]<j+4)
+			jj++;
+		if(jj<nv)
+			kjj = cv[jj];
+		else
+			kjj = cv[jj-1];
+		k0 = kii<kjj ? kii : kjj;
+
+		kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		}
+	if(j<n)
+		{
+
+		while(jj<nv && rv[jj]<j+4)
+			jj++;
+		if(jj<nv)
+			kjj = cv[jj];
+		else
+			kjj = cv[jj-1];
+		k0 = kii<kjj ? kii : kjj;
+
+		if(j<i) // dgemm
+			{
+			kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+			}
+		else // dsyrk
+			{
+			kernel_dsyrk_nt_l_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+			if(j<n-4)
+				{
+				kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4); // TODO
+				}
+			}
+		}
+	return;
+#endif
+
+	left_4:
+
+	kii = cv[nv-1];
+
+	j = 0;
+	jj = 0;
+	for(; j<i && j<n-3; j+=4)
+		{
+
+		while(jj<nv && rv[jj]<j+4)
+			jj++;
+		if(jj<nv)
+			kjj = cv[jj];
+		else
+			kjj = cv[jj-1];
+		k0 = kii<kjj ? kii : kjj;
+
+		kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+		}
+	if(j<n)
+		{
+
+		while(jj<nv && rv[jj]<j+4)
+			jj++;
+		if(jj<nv)
+			kjj = cv[jj];
+		else
+			kjj = cv[jj-1];
+		k0 = kii<kjj ? kii : kjj;
+
+		if(j<i) // dgemm
+			{
+			kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+			}
+		else // dsyrk
+			{
+			kernel_dsyrk_nt_l_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+			}
+		}
+	return;
+
+	}
+#endif
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
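+// Note on the d_strmat arguments used below: pA points to the panel-major
+// ("lib4") data, cn is used as the panel stride (sda, sdb, ... below,
+// presumably the padded number of columns), and ai/aj, bi/bj, ... are element
+// offsets inside the matrix. The wrappers split the row offset into a
+// full-panel part, ai-air, added to the pointer, and a within-panel remainder
+// air = ai & (ps-1), which is handled by the _gen_ kernels together with
+// offsetC/offsetD for non-aligned outputs. A hedged helper showing the element
+// offset this indexing appears to assume (the helper is not part of the library):
+#if 0
+static int lib4_offset(int i, int j, int sd)
+	{
+	const int ps = 4; // panel height in doubles
+	return (i/ps)*ps*sd + j*ps + i%ps; // full panels above + column inside panel + row inside panel
+	}
+#endif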
+
+// dgemm nt
+void dgemm_nt_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0 | n<=0)
+		return;
+	
+	const int ps = 4;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	int air = ai & (ps-1);
+	int bir = bi & (ps-1);
+	double *pA = sA->pA + aj*ps + (ai-air)*sda;
+	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
+	double *pC = sC->pA + cj*ps;
+	double *pD = sD->pA + dj*ps;
+
+	if(ai==0 & bi==0 & ci==0 & di==0)
+		{
+		dgemm_nt_lib(m, n, k, alpha, pA, sda, pB, sdb, beta, pC, sdc, pD, sdd); 
+		return;
+		}
+	
+	int ci0 = ci-air;
+	int di0 = di-air;
+	int offsetC;
+	int offsetD;
+	if(ci0>=0)
+		{
+		pC += ci0/ps*ps*sdc;
+		offsetC = ci0%ps;
+		}
+	else
+		{
+		pC += -4*sdc;
+		offsetC = ps+ci0;
+		}
+	if(di0>=0)
+		{
+		pD += di0/ps*ps*sdd;
+		offsetD = di0%ps;
+		}
+	else
+		{
+		pD += -4*sdd;
+		offsetD = ps+di0;
+		}
+	
+	int i, j, l;
+
+	int idxB;
+
+	// clean up at the beginning
+	if(air!=0)
+		{
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+		if(m>5)
+			{
+			j = 0;
+			idxB = 0;
+			// clean up at the beginning
+			if(bir!=0)
+				{
+				kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[0], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps]-bir*ps, sdc, offsetD, &pD[j*ps]-bir*ps, sdd, air, air+m, bir, n-j);
+				j += ps-bir;
+				idxB += 4;
+				}
+			// main loop
+			for(; j<n; j+=4)
+				{
+				kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[0], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
+				idxB += 4;
+				}
+			m -= 2*ps-air;
+			pA += 2*ps*sda;
+			pC += 2*ps*sdc;
+			pD += 2*ps*sdd;
+			}
+		else // m<=4
+			{
+#endif
+			j = 0;
+			idxB = 0;
+			// clean up at the beginning
+			if(bir!=0)
+				{
+				kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[0], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps]-bir*ps, sdc, offsetD, &pD[j*ps]-bir*ps, sdd, air, air+m, bir, n-j);
+				j += ps-bir;
+				idxB += 4;
+				}
+			// main loop
+			for(; j<n; j+=4)
+				{
+				kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[0], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
+				idxB += 4;
+				}
+			m -= ps-air;
+			pA += ps*sda;
+			pC += ps*sdc;
+			pD += ps*sdd;
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+			// nothing more to do
+			return;
+			}
+#endif
+		}
+	i = 0;
+	// main loop
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	for(; i<m-4; i+=8)
+		{
+		j = 0;
+		idxB = 0;
+		// clean up at the beginning
+		if(bir!=0)
+			{
+			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, n-j);
+			j += ps-bir;
+			idxB += 4;
+			}
+		// main loop
+		for(; j<n; j+=4)
+			{
+			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+			idxB += 4;
+			}
+		}
+	if(i<m)
+		{
+		j = 0;
+		idxB = 0;
+		// clean up at the beginning
+		if(bir!=0)
+			{
+			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, n-j);
+			j += ps-bir;
+			idxB += 4;
+			}
+		// main loop
+		for(; j<n; j+=4)
+			{
+			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+			idxB += 4;
+			}
+		}
+#else
+	for(; i<m; i+=4)
+		{
+		j = 0;
+		idxB = 0;
+		// clean up at the beginning
+		if(bir!=0)
+			{
+			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, n-j);
+			j += ps-bir;
+			idxB += 4;
+			}
+		// main loop
+		for(; j<n; j+=4)
+			{
+			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+			idxB += 4;
+			}
+		}
+#endif
+
+	return;
+
+	}
+
+
+
+// dgemm nn
+void dgemm_nn_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int ps = 4;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	int air = ai & (ps-1);
+	int bir = bi & (ps-1);
+	double *pA = sA->pA + aj*ps + (ai-air)*sda;
+	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
+	double *pC = sC->pA + cj*ps;
+	double *pD = sD->pA + dj*ps;
+
+	int offsetB = bir;
+
+	int ci0 = ci-air;
+	int di0 = di-air;
+	int offsetC;
+	int offsetD;
+	if(ci0>=0)
+		{
+		pC += ci0/ps*ps*sdc;
+		offsetC = ci0%ps;
+		}
+	else
+		{
+		pC += -4*sdc;
+		offsetC = ps+ci0;
+		}
+	if(di0>=0)
+		{
+		pD += di0/ps*ps*sdd;
+		offsetD = di0%ps;
+		}
+	else
+		{
+		pD += -4*sdd;
+		offsetD = ps+di0;
+		}
+	
+	int i, j, l;
+
+	// clean up at the beginning
+	if(air!=0)
+		{
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+		if(m>5)
+			{
+			j = 0;
+			for(; j<n; j+=4)
+				{
+				kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[0], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
+				}
+			m -= 2*ps-air;
+			pA += 2*ps*sda;
+			pC += 2*ps*sdc;
+			pD += 2*ps*sdd;
+			}
+		else // m<=4
+			{
+#endif
+			j = 0;
+			for(; j<n; j+=4)
+				{
+				kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[0], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
+				}
+			m -= ps-air;
+			pA += ps*sda;
+			pC += ps*sdc;
+			pD += ps*sdd;
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+			// nothing more to do
+			return;
+			}
+#endif
+		}
+	// main loop
+	i = 0;
+	if(offsetC==0 & offsetD==0)
+		{
+#if defined(TARGET_X64_INTEL_HASWELL)
+		for(; i<m-11; i+=12)
+			{
+			j = 0;
+			for(; j<n-3; j+=4)
+				{
+				kernel_dgemm_nn_12x4_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+				}
+			if(j<n)
+				{
+//				kernel_dgemm_nn_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+i*sdc], sdc, 0, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+				kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+i*sdc], sdc, 0, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+				kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[(i+8)*sda], offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+(i+8)*sdc], sdc, 0, &pD[j*ps+(i+8)*sdd], sdd, 0, m-(i+8), 0, n-j);
+				}
+			}
+		if(m>i)
+			{
+			if(m-i<=4)
+				{
+				goto left_4;
+				}
+			else if(m-i<=8)
+				{
+				goto left_8;
+				}
+			else
+				{
+				goto left_12;
+				}
+			}
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+		for(; i<m-7; i+=8)
+			{
+			j = 0;
+			for(; j<n-3; j+=4)
+				{
+				kernel_dgemm_nn_8x4_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+				}
+			if(j<n)
+				{
+				kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+i*sdc], sdc, 0, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+				}
+			}
+		if(m>i)
+			{
+			if(m-i<=4)
+				{
+				goto left_4;
+				}
+			else
+				{
+				goto left_8;
+				}
+			}
+#else
+		for(; i<m-3; i+=4)
+			{
+			j = 0;
+			for(; j<n-3; j+=4)
+				{
+				kernel_dgemm_nn_4x4_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+				}
+			if(j<n)
+				{
+				kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, 0, &pC[j*ps+i*sdc], sdc, 0, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+				}
+			}
+		if(m>i)
+			{
+			goto left_4;
+			}
+#endif
+		}
+	else
+		{
+// TODO 12x4
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+		for(; i<m-4; i+=8)
+			{
+			j = 0;
+			for(; j<n; j+=4)
+				{
+				kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+				}
+			}
+		if(m>i)
+			{
+			goto left_4;
+			}
+#else
+		for(; i<m; i+=4)
+			{
+			j = 0;
+			for(; j<n; j+=4)
+				{
+				kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+				}
+			}
+#endif
+		}
+
+	// common return if i==m
+	return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_12:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+//		kernel_dgemm_nn_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*sdb], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+		kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+		kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[(i+8)*sda], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+(i+8)*sdc], sdc, offsetD, &pD[j*ps+(i+8)*sdd], sdd, 0, m-(i+8), 0, n-j);
+		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+	left_8:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+		}
+	return;
+#endif
+
+	left_4:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+		}
+	return;
+
+	}
+	
+
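+// In the "nn" variants above, B is not transposed, so the kernels walk down
+// the rows of its panels: they take the extra arguments offsetB (= bi & (ps-1))
+// and the panel stride sdb, while the "nt" kernels only index B by panel (e.g.
+// &pB[j*sdb]). Hedged column-major scalar reference of the operation itself,
+// D = beta*C + alpha * A * B, with illustrative names and leading dimensions:
+#if 0
+static void dgemm_nn_ref(int m, int n, int k, double alpha, double *A, int lda, double *B, int ldb, double beta, double *C, int ldc, double *D, int ldd)
+	{
+	int ii, jj, ll;
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			double tmp = 0.0;
+			for(ll=0; ll<k; ll++)
+				tmp += A[ii+ll*lda] * B[ll+jj*ldb];
+			D[ii+jj*ldd] = beta*C[ii+jj*ldc] + alpha*tmp;
+			}
+		}
+	}
+#endif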
+
+// dtrsm_left_lower_nottransposed_unit
+void dtrsm_llnu_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
+	{
+	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+		{
+		printf("\ndtrsm_llnu_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+		exit(1);
+		}
+	const int ps = 4;
+	// TODO alpha
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdd = sD->cn;
+	double *pA = sA->pA + aj*ps;
+	double *pB = sB->pA + bj*ps;
+	double *pD = sD->pA + dj*ps;
+	dtrsm_nn_ll_one_lib(m, n, pA, sda, pB, sdb, pD, sdd); 
+	return;
+	}
+
+
+
+// dtrsm_left_upper_nottransposed_notunit
+void dtrsm_lunn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
+	{
+	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+		{
+		printf("\ndtrsm_lunn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+		exit(1);
+		}
+	const int ps = 4;
+	// TODO alpha
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdd = sD->cn;
+	double *pA = sA->pA + aj*ps;
+	double *pB = sB->pA + bj*ps;
+	double *pD = sD->pA + dj*ps;
+	double *dA = sA->dA;
+	int ii;
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+			for(ii=0; ii<n; ii++)
+				dA[ii] = 1.0 / dA[ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+		for(ii=0; ii<n; ii++)
+			dA[ii] = 1.0 / dA[ii];
+		sA->use_dA = 0;
+		}
+	dtrsm_nn_lu_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd); 
+	return;
+	}
+
+
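+// The *_libstr wrappers in this section follow a common pattern: they reject
+// the combinations that are not implemented yet (non-zero ai/bi/di offsets,
+// alpha!=1.0 where marked TODO) with a printf/exit, derive the panel strides
+// from the cn fields, apply the column offsets as aj*ps, and dispatch to the
+// corresponding *_lib routine above. The triangular solves with a non-unit
+// diagonal additionally extract the diagonal of A with ddiaex_lib, store its
+// elementwise reciprocals in sA->dA, and set use_dA so that repeated solves
+// with the same aligned factor (ai==0 and aj==0) can reuse the cached inverted
+// diagonal instead of recomputing it.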
+
+// dtrsm_right_lower_transposed_notunit
+void dtrsm_rltn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+		{
+		printf("\ndtrsm_rltn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+		exit(1);
+		}
+
+	const int ps = 4;
+
+	// TODO alpha !!!!!
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdd = sD->cn;
+	double *pA = sA->pA + aj*ps;
+	double *pB = sB->pA + bj*ps;
+	double *pD = sD->pA + dj*ps;
+	double *dA = sA->dA;
+
+	int i, j;
+
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+			for(i=0; i<n; i++)
+				dA[i] = 1.0 / dA[i];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+		for(i=0; i<n; i++)
+			dA[i] = 1.0 / dA[i];
+		sA->use_dA = 0;
+		}
+
+//	dtrsm_nt_rl_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd); 
+	i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-11; i+=12)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_dtrsm_nt_rl_inv_12x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j]);
+			}
+		if(j<n)
+			{
+			kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else
+			{
+			goto left_12;
+			}
+		}
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	for(; i<m-7; i+=8)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_dtrsm_nt_rl_inv_8x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j]);
+			}
+		if(j<n)
+			{
+			kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else
+			{
+			goto left_8;
+			}
+		}
+#else
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_dtrsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j]);
+			}
+		if(j<n)
+			{
+			kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+#endif
+
+	// common return if i==m
+	return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_12:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_8:
+	j = 0;
+	for(; j<n-8; j+=12)
+		{
+		kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], sda, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
+		kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4((j+4), &pD[i*sdd], sdd, &pA[(j+4)*sda], sda, &pB[(j+4)*ps+i*sdb], sdb, &pD[(j+4)*ps+i*sdd], sdd, &pA[(j+4)*ps+(j+4)*sda], sda, &dA[(j+4)], m-i, n-(j+4));
+		}
+	if(j<n-4)
+		{
+		kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], sda, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
+		kernel_dtrsm_nt_rl_inv_4x4_vs_lib4((j+4), &pD[i*sdd], &pA[(j+4)*sda], &pB[(j+4)*ps+i*sdb], &pD[(j+4)*ps+i*sdd], &pA[(j+4)*ps+(j+4)*sda], &dA[(j+4)], m-i, n-(j+4));
+		j += 8;
+		}
+	else if(j<n)
+		{
+		kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+		j += 4;
+		}
+	return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	left_8:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_4:
+	j = 0;
+	for(; j<n-8; j+=12)
+		{
+		kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(j, &pD[i*sdd], &pA[j*sda], sda, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
+		}
+	if(j<n-4)
+		{
+		kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(j, &pD[i*sdd], &pA[j*sda], sda, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
+		j += 8;
+		}
+	else if(j<n)
+		{
+		kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+		j += 4;
+		}
+	return;
+#else
+	left_4:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
+		}
+	return;
+#endif
+
+	}
+
+
+
+// dtrsm_right_lower_transposed_unit
+void dtrsm_rltu_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
+	{
+	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+		{
+		printf("\ndtrsm_rltu_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+		exit(1);
+		}
+	const int ps = 4;
+	// TODO alpha
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdd = sD->cn;
+	double *pA = sA->pA + aj*ps;
+	double *pB = sB->pA + bj*ps;
+	double *pD = sD->pA + dj*ps;
+	dtrsm_nt_rl_one_lib(m, n, pA, sda, pB, sdb, pD, sdd); 
+	return;
+	}
+
+
+
+// dtrsm_right_upper_transposed_notunit
+void dtrsm_rutn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj)
+	{
+	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+		{
+		printf("\ndtrsm_rutn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+		exit(1);
+		}
+	const int ps = 4;
+	// TODO alpha
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdd = sD->cn;
+	double *pA = sA->pA + aj*ps;
+	double *pB = sB->pA + bj*ps;
+	double *pD = sD->pA + dj*ps;
+	double *dA = sA->dA;
+	int ii;
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+			for(ii=0; ii<n; ii++)
+				dA[ii] = 1.0 / dA[ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		ddiaex_lib(n, 1.0, ai, pA, sda, dA);
+		for(ii=0; ii<n; ii++)
+			dA[ii] = 1.0 / dA[ii];
+		sA->use_dA = 0;
+		}
+	dtrsm_nt_ru_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd); 
+	return;
+	}
+
+
+
+// dtrmm_right_upper_transposed_notunit (B, i.e. the first matrix, is triangular !!!)
+void dtrmm_rutn_libstr(int m, int n, double alpha, struct d_strmat *sB, int bi, int bj, struct d_strmat *sA, int ai, int aj, struct d_strmat *sD, int di, int dj)
+	{
+	if(ai!=0 | bi!=0 | di!=0)
+		{
+		printf("\ndtrmm_rutn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
+		exit(1);
+		}
+	const int ps = 4;
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdd = sD->cn;
+	double *pA = sA->pA + aj*ps;
+	double *pB = sB->pA + bj*ps;
+	double *pD = sD->pA + dj*ps;
+	dtrmm_nt_ru_lib(m, n, alpha, pA, sda, pB, sdb, 0.0, pD, sdd, pD, sdd); 
+	return;
+	}
+
+
+
+// dtrmm_right_lower_nottransposed_notunit (B, i.e. the first matrix, is triangular !!!)
+void dtrmm_rlnn_libstr(int m, int n, double alpha, struct d_strmat *sB, int bi, int bj, struct d_strmat *sA, int ai, int aj, struct d_strmat *sD, int di, int dj)
+	{
+
+	const int ps = 4;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdd = sD->cn;
+	int air = ai & (ps-1);
+	int bir = bi & (ps-1);
+	double *pA = sA->pA + aj*ps + (ai-air)*sda;
+	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
+	double *pD = sD->pA + dj*ps;
+
+	int offsetB = bir;
+
+	int di0 = di-air;
+	int offsetD;
+	if(di0>=0)
+		{
+		pD += di0/ps*ps*sdd;
+		offsetD = di0%ps;
+		}
+	else
+		{
+		pD += -4*sdd;
+		offsetD = ps+di0;
+		}
+	
+	int ii, jj;
+
+	if(air!=0)
+		{
+		jj = 0;
+		for(; jj<n; jj+=4)
+			{
+			kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[jj*ps], sdd, air, air+m, 0, n-jj);
+			}
+		m -= ps-air;
+		pA += ps*sda;
+		pD += ps*sdd;
+		}
+	ii = 0;
+	if(offsetD==0)
+		{
+#if defined(TARGET_X64_INTEL_HASWELL)
+		for(; ii<m-11; ii+=12)
+			{
+			jj = 0;
+			for(; jj<n-5; jj+=4)
+				{
+				kernel_dtrmm_nn_rl_12x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd); // n-jj>=6 !!!!!
+				}
+			for(; jj<n; jj+=4)
+				{
+				kernel_dtrmm_nn_rl_12x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, 12, n-jj);
+//				kernel_dtrmm_nn_rl_8x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, 8, n-jj);
+//				kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[(ii+8)*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, 0, &pD[(ii+8)*sdd+jj*ps], sdd, 0, 4, 0, n-jj);
+				}
+			}
+		if(ii<m)
+			{
+			if(ii<m-8)
+				goto left_12;
+			else if(ii<m-4)
+				goto left_8;
+			else
+				goto left_4_gen;
+			}
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+		for(; ii<m-7; ii+=8)
+			{
+			jj = 0;
+			for(; jj<n-5; jj+=4)
+				{
+				kernel_dtrmm_nn_rl_8x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd);
+				}
+			for(; jj<n; jj+=4)
+				{
+				kernel_dtrmm_nn_rl_8x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, 0, &pD[ii*sdd+jj*ps], sdd, 0, 8, 0, n-jj);
+				}
+			}
+		if(ii<m)
+			{
+			if(ii<m-4)
+				goto left_8_gen;
+			else
+				goto left_4_gen;
+			}
+#else
+		for(; ii<m-3; ii+=4)
+			{
+			jj = 0;
+			for(; jj<n-5; jj+=4)
+				{
+				kernel_dtrmm_nn_rl_4x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps]);
+				}
+			for(; jj<n; jj+=4)
+				{
+				kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, 0, &pD[ii*sdd+jj*ps], sdd, 0, 4, 0, n-jj);
+				}
+			}
+		if(ii<m)
+			{
+			goto left_4_gen;
+			}
+#endif
+		}
+	else
+		{
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+		for(; ii<m-4; ii+=8)
+			{
+			jj = 0;
+			for(; jj<n; jj+=4)
+				{
+				kernel_dtrmm_nn_rl_8x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
+				}
+			}
+		if(ii<m)
+			{
+			goto left_4_gen;
+			}
+#else
+		for(; ii<m; ii+=4)
+			{
+			jj = 0;
+			for(; jj<n; jj+=4)
+				{
+				kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
+				}
+			}
+#endif
+		}
+
+	// common return if i==m
+	return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_12:
+	jj = 0;
+	for(; jj<n; jj+=4)
+		{
+		kernel_dtrmm_nn_rl_12x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, m-ii, n-jj);
+		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_8:
+	jj = 0;
+	for(; jj<n; jj+=4)
+		{
+		kernel_dtrmm_nn_rl_8x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, m-ii, n-jj);
+		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	left_8_gen:
+	jj = 0;
+	for(; jj<n; jj+=4)
+		{
+		kernel_dtrmm_nn_rl_8x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
+		}
+	return;
+#endif
+
+	left_4_gen:
+	jj = 0;
+	for(; jj<n; jj+=4)
+		{
+		kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
+		}
+	return;
+
+	}
+
+
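+// Hedged scalar reference of dtrmm_rlnn_libstr above: D = alpha * A * B with B
+// lower triangular and not transposed (there is no beta/C term in this
+// routine). Column-major sketch with illustrative names, not the panel-major
+// layout used by the kernels.
+#if 0
+static void dtrmm_rlnn_ref(int m, int n, double alpha, double *A, int lda, double *B, int ldb, double *D, int ldd)
+	{
+	int ii, jj, ll;
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			double tmp = 0.0;
+			for(ll=jj; ll<n; ll++) // B lower triangular: B[ll,jj]==0 for ll<jj
+				tmp += A[ii+ll*lda] * B[ll+jj*ldb];
+			D[ii+jj*ldd] = alpha*tmp;
+			}
+		}
+	}
+#endif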
+
+void dsyrk_ln_libstr(int m, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+	{
+	
+	if(m<=0)
+		return;
+
+	if(ai!=0 | bi!=0)
+		{
+		printf("\ndsyrk_ln_libstr: feature not implemented yet: ai=%d, bi=%d\n", ai, bi);
+		exit(1);
+		}
+
+	const int ps = 4;
+
+	int i, j;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	double *pA = sA->pA + aj*ps;
+	double *pB = sB->pA + bj*ps;
+	double *pC = sC->pA + cj*ps + (ci-(ci&(ps-1)))*sdc;
+	double *pD = sD->pA + dj*ps + (di-(di&(ps-1)))*sdd;
+
+	// TODO ai and bi
+	int offsetC;
+	int offsetD;
+	offsetC = ci&(ps-1);
+	offsetD = di&(ps-1);
+
+	// main loop
+	i = 0;
+	if(offsetC==0 & offsetD==0)
+		{
+#if defined(TARGET_X64_INTEL_HASWELL)
+		for(; i<m-11; i+=12)
+			{
+			j = 0;
+			for(; j<i; j+=4)
+				{
+				kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+				}
+			kernel_dsyrk_nt_l_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+			kernel_dsyrk_nt_l_8x8_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
+			}
+		if(m>i)
+			{
+			if(m-i<=4)
+				{
+				goto left_4;
+				}
+			else if(m-i<=8)
+				{
+				goto left_8;
+				}
+			else
+				{
+				goto left_12;
+				}
+			}
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+		for(; i<m-7; i+=8)
+			{
+			j = 0;
+			for(; j<i; j+=4)
+				{
+				kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+				}
+			kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+			kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd]);
+			}
+		if(m>i)
+			{
+			if(m-i<=4)
+				{
+				goto left_4;
+				}
+			else
+				{
+				goto left_8;
+				}
+			}
+#else
+		for(; i<m-3; i+=4)
+			{
+			j = 0;
+			for(; j<i; j+=4)
+				{
+				kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+				}
+			kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+			}
+		if(m>i)
+			{
+			goto left_4;
+			}
+#endif
+		}
+	else
+		{
+#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+		for(; i<m-4; i+=8)
+			{
+			j = 0;
+			for(; j<i; j+=4)
+				{
+				kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+				}
+			kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+			kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc], sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd], sdd, 0, m-i-4, 0, m-j-4);
+			}
+		if(m>i)
+			{
+			goto left_4_gen;
+			}
+#else
+		for(; i<m; i+=4)
+			{
+			j = 0;
+			for(; j<i; j+=4)
+				{
+				kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+				}
+			kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+			}
+#endif
+		}
+
+	// common return if i==m
+	return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_12:
+	j = 0;
+	for(; j<i; j+=4)
+		{
+		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+		}
+	kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+	kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, m-j-4);
+//	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_8:
+	j = 0;
+	for(; j<i-8; j+=12)
+		{
+		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+		kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, m-(j+4));
+		}
+	if(j<i-4)
+		{
+		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, m-(j+4));
+		j += 8;
+		}
+	else if(j<i)
+		{
+		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+		j += 4;
+		}
+	kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+//	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
+	return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	left_8:
+	j = 0;
+	for(; j<i; j+=4)
+		{
+		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+		}
+	kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
+	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, m-j-4);
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_4:
+	j = 0;
+	for(; j<i-8; j+=12)
+		{
+		kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+		}
+	if(j<i-4)
+		{
+		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+		j += 8;
+		}
+	else if(j<i)
+		{
+		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+		j += 4;
+		}
+	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+	return;
+#else
+	left_4:
+	j = 0;
+	for(; j<i; j+=4)
+		{
+		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+		}
+	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
+	return;
+#endif
+
+	left_4_gen:
+	j = 0;
+	for(; j<i; j+=4)
+		{
+		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+		}
+	kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
+	return;
+
+	}
+
+
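+// Hedged scalar reference of dsyrk_ln_libstr above: only the lower triangle of
+// D = beta*C + alpha * A * B^T is computed and stored (dgemm kernels below the
+// diagonal block, dsyrk_nt_l kernels on it). dsyrk_ln_mn_libstr below does the
+// same for the m x n lower-trapezoidal part. Column-major sketch, illustrative
+// names only:
+#if 0
+static void dsyrk_ln_ref(int m, int k, double alpha, double *A, int lda, double *B, int ldb, double beta, double *C, int ldc, double *D, int ldd)
+	{
+	int ii, jj, ll;
+	for(jj=0; jj<m; jj++)
+		{
+		for(ii=jj; ii<m; ii++) // lower triangle only
+			{
+			double tmp = 0.0;
+			for(ll=0; ll<k; ll++)
+				tmp += A[ii+ll*lda] * B[jj+ll*ldb];
+			D[ii+jj*ldd] = beta*C[ii+jj*ldc] + alpha*tmp;
+			}
+		}
+	}
+#endif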
+
+void dsyrk_ln_mn_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+	{
+	
+	if(m<=0 | n<=0)
+		return;
+
+	if(ai!=0 | bi!=0)
+		{
+		printf("\ndsyrk_ln_mn_libstr: feature not implemented yet: ai=%d, bi=%d\n", ai, bi);
+		exit(1);
+		}
+
+	const int ps = 4;
+
+	int i, j;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	double *pA = sA->pA + aj*ps;
+	double *pB = sB->pA + bj*ps;
+	double *pC = sC->pA + cj*ps + (ci-(ci&(ps-1)))*sdc;
+	double *pD = sD->pA + dj*ps + (di-(di&(ps-1)))*sdd;
+
+	// TODO ai and bi
+	int offsetC;
+	int offsetD;
+	offsetC = ci&(ps-1);
+	offsetD = di&(ps-1);
+
+	// main loop
+	i = 0;
+	if(offsetC==0 & offsetD==0)
+		{
+#if defined(TARGET_X64_INTEL_HASWELL)
+		for(; i<m-11; i+=12)
+			{
+			j = 0;
+			for(; j<i & j<n-3; j+=4)
+				{
+				kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+				}
+			if(j<n)
+				{
+				if(j<i) // dgemm
+					{
+					kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+					}
+				else // dsyrk
+					{
+					if(j<n-11)
+						{
+						kernel_dsyrk_nt_l_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+						kernel_dsyrk_nt_l_8x8_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
+						}
+					else
+						{
+						kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+						if(j<n-4)
+							{
+							kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, n-j-4);
+							if(j<n-8)
+								{
+								kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
+								}
+							}
+						}
+					}
+				}
+			}
+		if(m>i)
+			{
+			if(m-i<=4)
+				{
+				goto left_4;
+				}
+			else if(m-i<=8)
+				{
+				goto left_8;
+				}
+			else
+				{
+				goto left_12;
+				}
+			}
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+		for(; i<m-7; i+=8)
+			{
+			j = 0;
+			for(; j<i & j<n-3; j+=4)
+				{
+				kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+				}
+			if(j<n)
+				{
+				if(j<i) // dgemm
+					{
+					kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+					}
+				else // dsyrk
+					{
+					if(j<n-7)
+						{
+						kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
+						kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd]);
+						}
+					else
+						{
+						kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+						if(j<n-4)
+							{
+							kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
+							}
+						}
+					}
+				}
+			}
+		if(m>i)
+			{
+			if(m-i<=4)
+				{
+				goto left_4;
+				}
+			else
+				{
+				goto left_8;
+				}
+			}
+#else
+		for(; i<m-3; i+=4)
+			{
+			j = 0;
+			for(; j<i & j<n-3; j+=4)
+				{
+				kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+				}
+			if(j<n)
+				{
+				if(i<j) // dgemm
+					{
+					kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+					}
+				else // dsyrk
+					{
+					if(j<n-3)
+						{
+						kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
+						}
+					else
+						{
+						kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+						}
+					}
+				}
+			}
+		if(m>i)
+			{
+			goto left_4;
+			}
+#endif
+		}
+	else
+		{
+#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+		for(; i<m-4; i+=8)
+			{
+			j = 0;
+			for(; j<i & j<n; j+=4)
+				{
+				kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+				}
+			if(j<n)
+				{
+				kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+				if(j<n-4)
+					{
+					kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc], sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd], sdd, 0, m-i-4, 0, n-j-4);
+					}
+				}
+			}
+		if(m>i)
+			{
+			goto left_4_gen;
+			}
+#else
+		for(; i<m; i+=4)
+			{
+			j = 0;
+			for(; j<i & j<n; j+=4)
+				{
+				kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+				}
+			if(j<n)
+				{
+				kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+				}
+			}
+#endif
+		}
+
+	// common return if i==m
+	return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_12:
+	j = 0;
+	for(; j<i & j<n; j+=4)
+		{
+		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		}
+	if(j<n)
+		{
+		kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		if(j<n-4)
+			{
+			kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, n-j-4);
+			if(j<n-8)
+				{
+				kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
+				}
+			}
+		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_8:
+	j = 0;
+	for(; j<i-8 & j<n-8; j+=12)
+		{
+		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, n-(j+4));
+		}
+	if(j<i-4 & j<n-4)
+		{
+		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, n-(j+4));
+		j += 8;
+		}
+	if(j<i & j<n)
+		{
+		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		j += 4;
+		}
+	if(j<n)
+		{
+		kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		if(j<n-4)
+			{
+			kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
+			}
+		}
+	return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	left_8:
+	j = 0;
+	for(; j<i & j<n; j+=4)
+		{
+		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		}
+	if(j<n)
+		{
+		kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
+		if(j<n-4)
+			{
+			kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
+			}
+		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_4:
+	j = 0;
+	for(; j<i-8 & j<n-8; j+=12)
+		{
+		kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+		}
+	if(j<i-4 & j<n-4)
+		{
+		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+		j += 8;
+		}
+	else if(j<i & j<n)
+		{
+		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+		j += 4;
+		}
+	if(j<n)
+		{
+		kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+		}
+	return;
+#else
+	left_4:
+	j = 0;
+	for(; j<i & j<n; j+=4)
+		{
+		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+		}
+	if(j<n)
+		{
+		kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
+		}
+	return;
+#endif
+
+	left_4_gen:
+	j = 0;
+	for(; j<i & j<n; j+=4)
+		{
+		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+		}
+	if(j<n)
+		{
+		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
+		}
+	return;
+
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
diff --git a/blas/d_blas_64.h b/blas/d_blas_64.h
new file mode 100644
index 0000000..8e6aba2
--- /dev/null
+++ b/blas/d_blas_64.h
@@ -0,0 +1,65 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// prototypes of the BLAS and LAPACK routines employed in BLASFEO WR (the BLAS wrapper version), in the 64-bit-integer (ILP64) variant
+
+// level 1
+void dcopy_(long long *m, double *x, long long *incx, double *y, long long *incy);
+void daxpy_(long long *m, double *alpha, double *x, long long *incx, double *y, long long *incy);
+void dscal_(long long *m, double *alpha, double *x, long long *incx);
+
+// level 2
+void dgemv_(char *ta, long long *m, long long *n, double *alpha, double *A, long long *lda, double *x, long long *incx, double *beta, double *y, long long *incy);
+void dsymv_(char *uplo, long long *m, double *alpha, double *A, long long *lda, double *x, long long *incx, double *beta, double *y, long long *incy);
+void dtrmv_(char *uplo, char *trans, char *diag, long long *n, double *A, long long *lda, double *x, long long *incx);
+void dtrsv_(char *uplo, char *trans, char *diag, long long *n, double *A, long long *lda, double *x, long long *incx);
+void dger_(long long *m, long long *n, double *alpha, double *x, long long *incx, double *y, long long *incy, double *A, long long *lda);
+
+// level 3
+void dgemm_(char *ta, char *tb, long long *m, long long *n, long long *k, double *alpha, double *A, long long *lda, double *B, long long *ldb, double *beta, double *C, long long *ldc);
+void dsyrk_(char *uplo, char *trans, long long *n, long long *k, double *alpha, double *A, long long *lda, double *beta, double *C, long long *ldc);
+void dtrmm_(char *side, char *uplo, char *trans, char *diag, long long *m, long long *n, double *alpha, double *A, long long *lda, double *B, long long *ldb);
+void dtrsm_(char *side, char *uplo, char *trans, char *diag, long long *m, long long *n, double *alpha, double *A, long long *lda, double *B, long long *ldb);
+
+// lapack
+long long dpotrf_(char *uplo, long long *m, double *A, long long *lda, long long *info);
+long long dgetrf_(long long *m, long long *n, double *A, long long *lda, long long *ipiv, long long *info);
+void dgeqrf_(long long *m, long long *n, double *A, long long *lda, double *tau, double *work, long long *lwork, long long *info);
+void dgeqr2_(long long *m, long long *n, double *A, long long *lda, double *tau, double *work, long long *info);
+
+
+
+#ifdef __cplusplus
+}
+#endif
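+
+// Illustrative sketch (not part of the library): with a 64-bit-integer (ILP64)
+// BLAS/LAPACK, such as BLIS built with 64-bit integers, every integer argument
+// is passed by reference as long long; a plain C = A*B for 2x2 matrices would
+// then read
+//
+//   double A[4] = {1, 0, 0, 1}, B[4] = {1, 2, 3, 4}, C[4];
+//   long long m = 2, n = 2, k = 2, lda = 2, ldb = 2, ldc = 2;
+//   double alpha = 1.0, beta = 0.0;
+//   dgemm_("N", "N", &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc);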
diff --git a/blas/d_lapack_lib.c b/blas/d_lapack_lib.c
new file mode 100644
index 0000000..ce68c3d
--- /dev/null
+++ b/blas/d_lapack_lib.c
@@ -0,0 +1,75 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#if defined(LA_BLAS)
+#if defined(REF_BLAS_BLIS)
+#include "d_blas_64.h"
+#else
+#include "d_blas.h"
+#endif
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_aux.h"
+
+
+
+#define REAL double
+
+#define STRMAT d_strmat
+#define STRVEC d_strvec
+
+#define GELQF_LIBSTR dgelqf_libstr
+#define GELQF_WORK_SIZE_LIBSTR dgelqf_work_size_libstr
+#define GEQRF_LIBSTR dgeqrf_libstr
+#define GEQRF_WORK_SIZE_LIBSTR dgeqrf_work_size_libstr
+#define GETF2_NOPIVOT dgetf2_nopivot
+#define GETRF_NOPIVOT_LIBSTR dgetrf_nopivot_libstr
+#define GETRF_LIBSTR dgetrf_libstr
+#define POTRF_L_LIBSTR dpotrf_l_libstr
+#define POTRF_L_MN_LIBSTR dpotrf_l_mn_libstr
+#define SYRK_POTRF_LN_LIBSTR dsyrk_dpotrf_ln_libstr
+
+#define COPY dcopy_
+#define GELQF dgelqf_
+#define GEMM dgemm_
+#define GER dger_
+#define GEQRF dgeqrf_
+#define GEQR2 dgeqr2_
+#define GETRF dgetrf_
+#define POTRF dpotrf_
+#define SCAL dscal_
+#define SYRK dsyrk_
+#define TRSM dtrsm_
+
+
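+// x_lapack_lib.c contains the precision-independent implementation, written in
+// terms of the REAL/STRMAT/STRVEC types and the routine-name macros above;
+// including it here instantiates the double-precision (d_) LAPACK-level routines
+// on top of the external BLAS/LAPACK symbols (dpotrf_, dgetrf_, dgemm_, ...).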
+#include "x_lapack_lib.c"
diff --git a/blas/d_lapack_lib4.c b/blas/d_lapack_lib4.c
new file mode 100644
index 0000000..75a4a4f
--- /dev/null
+++ b/blas/d_lapack_lib4.c
@@ -0,0 +1,2671 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+/****************************
+* old interface
+****************************/
+
+
+
+void dgetrf_nn_nopivot_lib(int m, int n, double *pC, int sdc, double *pD, int sdd, double *inv_diag_D)
+	{
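+	// LU factorization without pivoting of the m x n panel-major (lib4, ps=4)
+	// matrix C into D, storing the inverse of the diagonal of the factor in
+	// inv_diag_D; each block of rows is processed in the three steps marked
+	// below: "solve lower" (trsm against the already factorized columns),
+	// "factorize" (the diagonal blocks), and "solve upper" (unit-lower trsm
+	// producing the rows of U to the right)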
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int ps = 4;
+
+	int ii, jj, ie;
+
+	// main loop
+	ii = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for( ; ii<m-11; ii+=12)
+		{
+		jj = 0;
+		// solve lower
+		ie = n<ii ? n : ii; // ie is multiple of 4
+		for( ; jj<ie-3; jj+=4)
+			{
+			kernel_dtrsm_nn_ru_inv_12x4_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj]);
+			}
+		if(jj<ie)
+			{
+			kernel_dtrsm_nn_ru_inv_12x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+			jj+=4;
+			}
+		// factorize
+		if(jj<n-3)
+			{
+			kernel_dgetrf_nn_l_12x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj]);
+			jj+=4;
+			}
+		else if(jj<n)
+			{
+			kernel_dgetrf_nn_l_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+			jj+=4;
+			}
+		if(jj<n-3)
+			{
+			kernel_dgetrf_nn_m_12x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj]);
+			jj+=4;
+			}
+		else if(jj<n)
+			{
+			kernel_dgetrf_nn_m_12x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+			jj+=4;
+			}
+		if(jj<n-3)
+			{
+			kernel_dgetrf_nn_r_12x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj]);
+			jj+=4;
+			}
+		else if(jj<n)
+			{
+			kernel_dgetrf_nn_r_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+			jj+=4;
+			}
+		// solve upper
+		for( ; jj<n-3; jj+=4)
+			{
+			kernel_dtrsm_nn_ll_one_12x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd);
+			}
+		if(jj<n)
+			{
+			kernel_dtrsm_nn_ll_one_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, m-ii, n-jj);
+			}
+		}
+	if(m>ii)
+		{
+		if(m-ii<=4)
+			{
+			goto left_4;
+			}
+		else if(m-ii<=8)
+			{
+			goto left_8;
+			}
+		else
+			{
+			goto left_12;
+			}
+		}
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	for( ; ii<m-7; ii+=8)
+		{
+		jj = 0;
+		// solve lower
+		ie = n<ii ? n : ii; // ie is multiple of 4
+		for( ; jj<ie-3; jj+=4)
+			{
+			kernel_dtrsm_nn_ru_inv_8x4_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj]);
+			}
+		if(jj<ie)
+			{
+			kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+			jj+=4;
+			}
+		// factorize
+		if(jj<n-3)
+			{
+			kernel_dgetrf_nn_l_8x4_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj]);
+//			kernel_dgetrf_nn_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj]);
+//			kernel_dtrsm_nn_ru_inv_4x4_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj]);
+			jj+=4;
+			}
+		else if(jj<n)
+			{
+			kernel_dgetrf_nn_l_8x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+//			kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+//			kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-(ii+4), n-jj);
+			jj+=4;
+			}
+		if(jj<n-3)
+			{
+			kernel_dtrsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd]);
+			kernel_dgetrf_nn_4x4_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &inv_diag_D[jj]);
+			jj+=4;
+			}
+		else if(jj<n)
+			{
+			kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd], m-ii, n-jj);
+			kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &inv_diag_D[jj], m-(ii+4), n-jj);
+			jj+=4;
+			}
+		// solve upper
+		for( ; jj<n-3; jj+=4)
+			{
+			kernel_dtrsm_nn_ll_one_8x4_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd);
+			}
+		if(jj<n)
+			{
+			kernel_dtrsm_nn_ll_one_8x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, m-ii, n-jj);
+			}
+		}
+	if(m>ii)
+		{
+		if(m-ii<=4)
+			{
+			goto left_4;
+			}
+		else
+			{
+			goto left_8;
+			}
+		}
+#else
+	for( ; ii<m-3; ii+=4)
+		{
+		jj = 0;
+		// solve lower
+		ie = n<ii ? n : ii; // ie is multiple of 4
+		for( ; jj<ie-3; jj+=4)
+			{
+			kernel_dtrsm_nn_ru_inv_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj]);
+			}
+		if(jj<ie)
+			{
+			kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+			jj+=4;
+			}
+		// factorize
+		if(jj<n-3)
+			{
+			kernel_dgetrf_nn_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj]);
+			jj+=4;
+			}
+		else if(jj<n)
+			{
+			kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+			jj+=4;
+			}
+		// solve upper
+		for( ; jj<n-3; jj+=4)
+			{
+			kernel_dtrsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd]);
+			}
+		if(jj<n)
+			{
+			kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd], m-ii, n-jj);
+			}
+		}
+	if(m>ii)
+		{
+		goto left_4;
+		}
+
+#endif
+
+	// common return if i==m
+	return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_12:
+	jj = 0;
+	// solve lower
+	ie = n<ii ? n : ii; // ie is multiple of 4
+	for( ; jj<ie; jj+=4)
+		{
+		kernel_dtrsm_nn_ru_inv_12x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+		}
+	// factorize
+	if(jj<n)
+		{
+		kernel_dgetrf_nn_l_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+		jj+=4;
+		}
+	if(jj<n)
+		{
+		kernel_dgetrf_nn_m_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+		jj+=4;
+		}
+	if(jj<n)
+		{
+		kernel_dgetrf_nn_r_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+		jj+=4;
+		}
+	// solve upper
+	for( ; jj<n; jj+=4)
+		{
+		kernel_dtrsm_nn_ll_one_12x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, m-ii, n-jj);
+		}
+	return;
+
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	left_8:
+	jj = 0;
+	// solve lower
+	ie = n<ii ? n : ii; // ie is multiple of 4
+	for( ; jj<ie; jj+=4)
+		{
+		kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+		}
+	// factorize
+	if(jj<n)
+		{
+		kernel_dgetrf_nn_l_8x4_vs_lib4(jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &inv_diag_D[jj], m-ii, n-jj);
+//		kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+//		kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-(ii+4), n-jj);
+		jj+=4;
+		}
+	if(jj<n)
+		{
+		kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd], m-ii, n-jj);
+		kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[(ii+4)*sdd], &pD[jj*ps], sdd, &pC[jj*ps+(ii+4)*sdc], &pD[jj*ps+(ii+4)*sdd], &inv_diag_D[jj], m-(ii+4), n-jj);
+		jj+=4;
+		}
+	// solve upper
+	for( ; jj<n; jj+=4)
+		{
+		kernel_dtrsm_nn_ll_one_8x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], sdc, &pD[jj*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, m-ii, n-jj);
+		}
+	return;
+
+#endif
+
+	left_4:
+	jj = 0;
+	// solve lower
+	ie = n<ii ? n : ii; // ie is multiple of 4
+	for( ; jj<ie; jj+=4)
+		{
+		kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[jj*ps+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+		}
+	// factorize
+	if(jj<n)
+		{
+		kernel_dgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+		jj+=4;
+		}
+	// solve upper
+	for( ; jj<n; jj+=4)
+		{
+		kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*ps], sdd, &pC[jj*ps+ii*sdc], &pD[jj*ps+ii*sdd], &pD[ii*ps+ii*sdd], m-ii, n-jj);
+		}
+	return;
+
+	}
+
+
+
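+// LU factorization with partial (row) pivoting. The matrix is factorized in
+// block-columns (12 columns at a time on Haswell, 8 on Sandy Bridge, 4 otherwise):
+// for each block-column the trailing rows are first updated with a gemm against
+// the already factorized columns, each 4-wide panel is then factorized with
+// kernel_dgetrf_pivot_4_lib4 and the selected row swaps are applied with
+// drowsw_lib, and finally the rows of U to the right of the block-column are
+// obtained with a unit-diagonal lower trsm ("solve upper").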
+void dgetrf_nn_lib(int m, int n, double *pC, int sdc, double *pD, int sdd, double *inv_diag_D, int *ipiv)
+	{
+
+	if(m<=0)
+		return;
+
+	const int ps = 4;
+
+	int ii, jj, i0, i1, j0, ll, p;
+
+	double d1 = 1.0;
+	double dm1 = -1.0;
+
+	// row exchanges need to be performed on the yet-to-be-factorized part of the matrix too, hence the upfront copy of C into D
+	if(pC!=pD)
+		dgecp_lib(m, n, 1.0, 0, pC, sdc, 0, pD, sdd);
+
+	// minimum matrix size
+	p = n<m ? n : m; // XXX
+
+	// main loop
+#if defined(TARGET_X64_INTEL_HASWELL)
+	// 12 columns at a time
+	jj = 0;
+	for(; jj<p-11; jj+=12)
+		{
+		// pivot & factorize & solve lower
+		// left block-column
+		ii = jj;
+		i0 = ii;
+		for( ; ii<m-11; ii+=12)
+			{
+			kernel_dgemm_nn_12x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
+			}
+		if(m-ii>0)
+			{
+			if(m-ii>8)
+				{
+				kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
+				}
+			else if(m-ii>4)
+				{
+				kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			else
+				{
+				kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			}
+		kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
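+		// make the local pivot indices absolute and apply the corresponding row
+		// swaps to the columns to the left (length jj) and to the right
+		// (length n-jj-4) of the current panel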
+		ipiv[i0+0] += i0;
+		if(ipiv[i0+0]!=i0+0)
+			{
+			drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+			drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+			}
+		ipiv[i0+1] += i0;
+		if(ipiv[i0+1]!=i0+1)
+			{
+			drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+			drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+			}
+		ipiv[i0+2] += i0;
+		if(ipiv[i0+2]!=i0+2)
+			{
+			drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+			drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+			}
+		ipiv[i0+3] += i0;
+		if(ipiv[i0+3]!=i0+3)
+			{
+			drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+			drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+			}
+		// middle block-column
+		ii = i0;
+		kernel_dtrsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd]);
+		ii += 4;
+		i1 = ii;
+		for( ; ii<m-11; ii+=12)
+			{
+			kernel_dgemm_nn_12x4_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd);
+			}
+		if(m-ii>0)
+			{
+			if(m-ii>8)
+				{
+				kernel_dgemm_nn_12x4_vs_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd, m-ii, 4);
+				}
+			else if(m-ii>4)
+				{
+				kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			else
+				{
+				kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			}
+		kernel_dgetrf_pivot_4_lib4(m-i1, &pD[(jj+4)*ps+i1*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i1]);
+		ipiv[i1+0] += i1;
+		if(ipiv[i1+0]!=i1+0)
+			{
+			drowsw_lib(jj+4, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+			drowsw_lib(n-jj-8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+8)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+8)*ps);
+			}
+		ipiv[i1+1] += i1;
+		if(ipiv[i1+1]!=i1+1)
+			{
+			drowsw_lib(jj+4, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+			drowsw_lib(n-jj-8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+8)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+8)*ps);
+			}
+		ipiv[i1+2] += i1;
+		if(ipiv[i1+2]!=i1+2)
+			{
+			drowsw_lib(jj+4, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+			drowsw_lib(n-jj-8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+8)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+8)*ps);
+			}
+		ipiv[i1+3] += i1;
+		if(ipiv[i1+3]!=i1+3)
+			{
+			drowsw_lib(jj+4, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+			drowsw_lib(n-jj-8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+8)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+8)*ps);
+			}
+		// right block-column
+		ii = i0;
+		kernel_dtrsm_nn_ll_one_8x4_lib4(ii, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd);
+		ii += 8;
+		i1 = ii;
+		for( ; ii<m-11; ii+=12)
+			{
+			kernel_dgemm_nn_12x4_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+8)*ps], sdd, &d1, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd);
+			}
+		if(m-ii>0)
+			{
+			if(m-ii>8)
+				{
+				kernel_dgemm_nn_12x4_vs_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &d1, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, m-ii, 4);
+				}
+			else if(m-ii>4)
+				{
+				kernel_dgemm_nn_8x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			else
+				{
+				kernel_dgemm_nn_4x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			}
+		kernel_dgetrf_pivot_4_lib4(m-i1, &pD[(jj+8)*ps+i1*sdd], sdd, &inv_diag_D[(jj+8)], &ipiv[i1]);
+		ipiv[i1+0] += i1;
+		if(ipiv[i1+0]!=i1+0)
+			{
+			drowsw_lib(jj+8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+			drowsw_lib(n-jj-12, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+12)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+12)*ps);
+			}
+		ipiv[i1+1] += i1;
+		if(ipiv[i1+1]!=i1+1)
+			{
+			drowsw_lib(jj+8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+			drowsw_lib(n-jj-12, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+12)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+12)*ps);
+			}
+		ipiv[i1+2] += i1;
+		if(ipiv[i1+2]!=i1+2)
+			{
+			drowsw_lib(jj+8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+			drowsw_lib(n-jj-12, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+12)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+12)*ps);
+			}
+		ipiv[i1+3] += i1;
+		if(ipiv[i1+3]!=i1+3)
+			{
+			drowsw_lib(jj+8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+			drowsw_lib(n-jj-12, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+12)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+12)*ps);
+			}
+
+		// solve upper
+//		i0 -= 8; // 4 ???
+		ll = jj+12;
+		for( ; ll<n-3; ll+=4)
+			{
+			kernel_dtrsm_nn_ll_one_12x4_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd);
+			}
+		if(ll<n)
+			{
+			kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd, 12, n-ll);
+			}
+		}
+	if(m>=n)
+		{
+		if(n-jj>0)
+			{
+			if(n-jj<=4)
+				goto left_n_4;
+			else if(n-jj<=8)
+				goto left_n_8;
+			else
+				goto left_n_12;
+			}
+		}
+	else // n>m
+		{
+		if(m-jj>0)
+			{
+			if(m-jj<=4)
+				goto left_m_4;
+			else if(m-jj<=8)
+				goto left_m_8;
+			else
+				goto left_m_12;
+			}
+		}
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	// 8 columns at a time
+	jj = 0;
+	for(; jj<p-7; jj+=8)
+		{
+		// pivot & factorize & solve lower
+		// left block-column
+		ii = jj;
+		i0 = ii;
+#if defined(TARGET_X64_INTEL_HASWELL) // XXX
+		for( ; ii<m-11; ii+=12)
+			{
+			kernel_dgemm_nn_12x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
+			}
+		if(m-ii>0)
+			{
+			if(m-ii>8)
+				{
+				kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
+				}
+			else if(m-ii>4)
+				{
+				kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			else
+				{
+				kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			}
+#else // SANDY_BRIDGE
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_dgemm_nn_8x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
+			}
+		if(m-ii>0)
+			{
+			if(m-ii>4)
+				{
+				kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			else
+				{
+				kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			}
+#endif
+		kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+		ipiv[i0+0] += i0;
+		if(ipiv[i0+0]!=i0+0)
+			{
+			drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+			drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+			}
+		ipiv[i0+1] += i0;
+		if(ipiv[i0+1]!=i0+1)
+			{
+			drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+			drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+			}
+		ipiv[i0+2] += i0;
+		if(ipiv[i0+2]!=i0+2)
+			{
+			drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+			drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+			}
+		ipiv[i0+3] += i0;
+		if(ipiv[i0+3]!=i0+3)
+			{
+			drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+			drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+			}
+		// right block-column
+		ii = i0;
+		kernel_dtrsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd]);
+		ii += 4;
+		i0 = ii;
+#if defined(TARGET_X64_INTEL_HASWELL) // XXX
+		for( ; ii<m-11; ii+=12)
+			{
+			kernel_dgemm_nn_12x4_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd);
+			}
+		if(m-ii>0)
+			{
+			if(m-ii>8)
+				{
+				kernel_dgemm_nn_12x4_vs_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd, m-ii, 4);
+				}
+			else if(m-ii>4)
+				{
+				kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			else
+				{
+				kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			}
+#else // SANDY_BRIDGE
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_dgemm_nn_8x4_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd);
+			}
+		if(m-ii>0)
+			{
+			if(m-ii>4)
+				{
+				kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			else
+				{
+				kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			}
+#endif
+		kernel_dgetrf_pivot_4_lib4(m-i0, &pD[(jj+4)*ps+i0*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i0]);
+		ipiv[i0+0] += i0;
+		if(ipiv[i0+0]!=i0+0)
+			{
+			drowsw_lib(jj+4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+			drowsw_lib(n-jj-8, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+8)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+8)*ps);
+			}
+		ipiv[i0+1] += i0;
+		if(ipiv[i0+1]!=i0+1)
+			{
+			drowsw_lib(jj+4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+			drowsw_lib(n-jj-8, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+8)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+8)*ps);
+			}
+		ipiv[i0+2] += i0;
+		if(ipiv[i0+2]!=i0+2)
+			{
+			drowsw_lib(jj+4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+			drowsw_lib(n-jj-8, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+8)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+8)*ps);
+			}
+		ipiv[i0+3] += i0;
+		if(ipiv[i0+3]!=i0+3)
+			{
+			drowsw_lib(jj+4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+			drowsw_lib(n-jj-8, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+8)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+8)*ps);
+			}
+
+		// solve upper
+		i0 -= 4;
+		ll = jj+8;
+		for( ; ll<n-3; ll+=4)
+			{
+			kernel_dtrsm_nn_ll_one_8x4_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd);
+			}
+		if(ll<n)
+			{
+			kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd, 8, n-ll);
+			}
+		}
+	if(m>=n)
+		{
+		if(n-jj>0)
+			{
+			if(n-jj<=4) // (m>=1 && n==1) || (m>=2 && n==2) || (m>=3 && n==3) || (m>=4 && n==4)
+				{
+				goto left_n_4;
+				}
+			else // (m>=5 && n==5) || (m>=6 && n==6) || (m>=7 && n==7)
+				goto left_n_8;
+			}
+		}
+	else // n>m
+		{
+		if(m-jj>0)
+			{
+			if(m-jj<=4) // (m==1 && n>=2) || (m==2 && n>=3) || (m==3 && n>=4) || (m==4 && n>=5)
+				goto left_m_4;
+			else // (m==5 && n>=6) || (m==6 && n>=7) || (m==7 && n>=8)
+				{
+				goto left_m_8;
+				}
+			}
+		}
+#else
+	// 4 columns at a time
+	jj = 0;
+	for(; jj<p-3; jj+=4) // XXX
+		{
+		// pivot & factorize & solve lower
+		ii = jj;
+		i0 = ii;
+#if defined(TARGET_X64_INTEL_HASWELL) // XXX
+		for( ; ii<m-11; ii+=12)
+			{
+			kernel_dgemm_nn_12x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
+			}
+		if(m-ii>0)
+			{
+			if(m-ii>8)
+				{
+				kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
+				}
+			else if(m-ii>4)
+				{
+				kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			else
+				{
+				kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			}
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) // XXX
+		for( ; ii<m-7; ii+=8)
+			{
+			kernel_dgemm_nn_8x4_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd);
+			}
+		if(m-ii>0)
+			{
+			if(m-ii>4)
+				{
+				kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			else
+				{
+				kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+				}
+			}
+#else
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_dgemm_nn_4x4_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], &pD[jj*ps+ii*sdd]);
+			}
+		if(m-ii>0)
+			{
+			kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+			}
+#endif
+		kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+		ipiv[i0+0] += i0;
+		if(ipiv[i0+0]!=i0+0)
+			{
+			drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+			drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+			}
+		ipiv[i0+1] += i0;
+		if(ipiv[i0+1]!=i0+1)
+			{
+			drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+			drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+			}
+		ipiv[i0+2] += i0;
+		if(ipiv[i0+2]!=i0+2)
+			{
+			drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+			drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+			}
+		ipiv[i0+3] += i0;
+		if(ipiv[i0+3]!=i0+3)
+			{
+			drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+			drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+			}
+
+		// solve upper
+		ll = jj+4;
+		for( ; ll<n-3; ll+=4)
+			{
+			kernel_dtrsm_nn_ll_one_4x4_lib4(i0, &pD[i0*sdd], &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], &pD[ll*ps+i0*sdd], &pD[i0*ps+i0*sdd]);
+			}
+		if(n-ll>0)
+			{
+			kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], &pD[ll*ps+i0*sdd], &pD[i0*ps+i0*sdd], 4, n-ll);
+			}
+		}
+	if(m>=n)
+		{
+		if(n-jj>0)
+			{
+			goto left_n_4;
+			}
+		}
+	else
+		{
+		if(m-jj>0)
+			{
+			goto left_m_4;
+			}
+		}
+#endif
+
+	// common return if jj==n
+	return;
+
+
+	// clean up
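+	// the left_n_* labels handle the trailing columns when m>=n, the left_m_*
+	// labels the trailing rows when n>m; the numeric suffix is the maximum
+	// block size handled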
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_n_12:
+	// 9-12 columns at a time
+	// pivot & factorize & solve lower
+	// left block-column
+	ii = jj;
+	i0 = ii;
+	for( ; ii<m-8; ii+=12)
+		{
+		kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
+		}
+	if(m-ii>4)
+		{
+		kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+//		ii+=8;
+		}
+	else if(m-ii>0)
+		{
+		kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+//		ii+=4;
+		}
+	kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+	ipiv[i0+0] += i0;
+	if(ipiv[i0+0]!=i0+0)
+		{
+		drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+		}
+	ipiv[i0+1] += i0;
+	if(ipiv[i0+1]!=i0+1)
+		{
+		drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+		}
+	ipiv[i0+2] += i0;
+	if(ipiv[i0+2]!=i0+2)
+		{
+		drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+		}
+	ipiv[i0+3] += i0;
+	if(ipiv[i0+3]!=i0+3)
+		{
+		drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+		}
+	// middle block-column
+	ii = i0;
+	kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd], 4, n-jj-4);
+	ii += 4;
+	i1 = ii;
+	for( ; ii<m-8; ii+=12)
+		{
+		kernel_dgemm_nn_12x4_vs_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, &pD[(jj+4)*ps], sdd, &d1, &pD[(jj+4)*ps+ii*sdd], sdd, &pD[(jj+4)*ps+ii*sdd], sdd, m-ii, n-jj-4);
+		}
+	if(m-ii>4)
+		{
+		kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+		}
+	else if(m-ii>0)
+		{
+		kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+		}
+	kernel_dgetrf_pivot_4_vs_lib4(m-i1, n-jj-4, &pD[(jj+4)*ps+i1*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i1]);
+	ipiv[i1+0] += i1;
+	if(ipiv[i1+0]!=i1+0)
+		{
+		drowsw_lib(jj+4, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+		drowsw_lib(n-jj-8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+8)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+8)*ps);
+		}
+	if(n-jj-4>1)
+		{
+		ipiv[i1+1] += i1;
+		if(ipiv[i1+1]!=i1+1)
+			{
+			drowsw_lib(jj+4, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+			drowsw_lib(n-jj-8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+8)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+8)*ps);
+			}
+		if(n-jj-4>2)
+			{
+			ipiv[i1+2] += i1;
+			if(ipiv[i1+2]!=i1+2)
+				{
+				drowsw_lib(jj+4, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+				drowsw_lib(n-jj-8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+8)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+8)*ps);
+				}
+			if(n-jj-4>3)
+				{
+				ipiv[i1+3] += i1;
+				if(ipiv[i1+3]!=i1+3)
+					{
+					drowsw_lib(jj+4, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+					drowsw_lib(n-jj-8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+8)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+8)*ps);
+					}
+				}
+			}
+		}
+	// right block-column
+	ii = i0;
+	kernel_dtrsm_nn_ll_one_8x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, 8, n-jj-8);
+	ii += 8;
+	i1 = ii;
+	for( ; ii<m-8; ii+=12)
+		{
+		kernel_dgemm_nn_12x4_vs_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &d1, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, m-ii, n-jj-8);
+		}
+	if(m-ii>4)
+		{
+		kernel_dgemm_nn_8x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-8);
+		}
+	else if(m-ii>0)
+		{
+		kernel_dgemm_nn_4x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-8);
+		}
+	kernel_dgetrf_pivot_4_vs_lib4(m-i1, n-jj-8, &pD[(jj+8)*ps+i1*sdd], sdd, &inv_diag_D[(jj+8)], &ipiv[i1]);
+	ipiv[i1+0] += i1;
+	if(ipiv[i1+0]!=i1+0)
+		{
+		drowsw_lib(jj+8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+		drowsw_lib(n-jj-12, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+12)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+12)*ps);
+		}
+	if(n-jj-8>1)
+		{
+		ipiv[i1+1] += i1;
+		if(ipiv[i1+1]!=i1+1)
+			{
+			drowsw_lib(jj+8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+			drowsw_lib(n-jj-12, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+12)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+12)*ps);
+			}
+		if(n-jj-8>2)
+			{
+			ipiv[i1+2] += i1;
+			if(ipiv[i1+2]!=i1+2)
+				{
+				drowsw_lib(jj+8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+				drowsw_lib(n-jj-12, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+12)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+12)*ps);
+				}
+			if(n-jj-8>3)
+				{
+				ipiv[i1+3] += i1;
+				if(ipiv[i1+3]!=i1+3)
+					{
+					drowsw_lib(jj+8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+					drowsw_lib(n-jj-12, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+12)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+12)*ps);
+					}
+				}
+			}
+		}
+
+	// solve upper
+	// there is no upper
+	return;
+#endif
+
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_m_12:
+	// 9-12 rows at a time
+	// pivot & factorize & solve lower
+	// left block-column
+	ii = jj;
+	i0 = ii;
+	kernel_dgemm_nn_12x4_vs_lib4(jj, &dm1, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, &d1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, m-ii, 4);
+	kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+	ipiv[i0+0] += i0;
+	if(ipiv[i0+0]!=i0+0)
+		{
+		drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+		}
+	ipiv[i0+1] += i0;
+	if(ipiv[i0+1]!=i0+1)
+		{
+		drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+		}
+	ipiv[i0+2] += i0;
+	if(ipiv[i0+2]!=i0+2)
+		{
+		drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+		}
+	ipiv[i0+3] += i0;
+	if(ipiv[i0+3]!=i0+3)
+		{
+		drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+		}
+	// middle block-column
+	ii = i0;
+	kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd], 4, n-jj-4);
+	ii += 4;
+	i1 = ii;
+	kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+	kernel_dgetrf_pivot_4_vs_lib4(m-i1, n-jj-4, &pD[(jj+4)*ps+i1*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i1]);
+	ipiv[i1+0] += i1;
+	if(ipiv[i1+0]!=i1+0)
+		{
+		drowsw_lib(jj+4, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+		drowsw_lib(n-jj-8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+8)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+8)*ps);
+		}
+	if(m-jj-4>1)
+		{
+		ipiv[i1+1] += i1;
+		if(ipiv[i1+1]!=i1+1)
+			{
+			drowsw_lib(jj+4, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+			drowsw_lib(n-jj-8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+8)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+8)*ps);
+			}
+		if(m-jj-4>2)
+			{
+			ipiv[i1+2] += i1;
+			if(ipiv[i1+2]!=i1+2)
+				{
+				drowsw_lib(jj+4, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+				drowsw_lib(n-jj-8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+8)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+8)*ps);
+				}
+			if(m-jj-4>3)
+				{
+				ipiv[i1+3] += i1;
+				if(ipiv[i1+3]!=i1+3)
+					{
+					drowsw_lib(jj+4, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+					drowsw_lib(n-jj-8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+8)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+8)*ps);
+					}
+				}
+			}
+		}
+	// right block-column
+	ii = i0;
+	kernel_dtrsm_nn_ll_one_8x4_vs_lib4(ii, &pD[ii*sdd], sdd, &pD[(jj+8)*ps], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[(jj+8)*ps+ii*sdd], sdd, &pD[ii*ps+ii*sdd], sdd, 8, n-jj-8);
+	ii += 8;
+	i1 = ii;
+	kernel_dgemm_nn_4x4_gen_lib4((jj+8), &dm1, &pD[ii*sdd], 0, &pD[(jj+8)*ps], sdd, &d1, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, &pD[(jj+8)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-8);
+	kernel_dgetrf_pivot_4_vs_lib4(m-i1, n-jj-8, &pD[(jj+8)*ps+i1*sdd], sdd, &inv_diag_D[(jj+8)], &ipiv[i1]);
+	ipiv[i1+0] += i1;
+	if(ipiv[i1+0]!=i1+0)
+		{
+		drowsw_lib(jj+8, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps);
+		drowsw_lib(n-jj-12, pD+(i1+0)/ps*ps*sdd+(i1+0)%ps+(jj+12)*ps, pD+(ipiv[i1+0])/ps*ps*sdd+(ipiv[i1+0])%ps+(jj+12)*ps);
+		}
+	if(m-jj-8>1)
+		{
+		ipiv[i1+1] += i1;
+		if(ipiv[i1+1]!=i1+1)
+			{
+			drowsw_lib(jj+8, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps);
+			drowsw_lib(n-jj-12, pD+(i1+1)/ps*ps*sdd+(i1+1)%ps+(jj+12)*ps, pD+(ipiv[i1+1])/ps*ps*sdd+(ipiv[i1+1])%ps+(jj+12)*ps);
+			}
+		if(m-jj-8>2)
+			{
+			ipiv[i1+2] += i1;
+			if(ipiv[i1+2]!=i1+2)
+				{
+				drowsw_lib(jj+8, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps);
+				drowsw_lib(n-jj-12, pD+(i1+2)/ps*ps*sdd+(i1+2)%ps+(jj+12)*ps, pD+(ipiv[i1+2])/ps*ps*sdd+(ipiv[i1+2])%ps+(jj+12)*ps);
+				}
+			if(m-jj-8>3)
+				{
+				ipiv[i1+3] += i1;
+				if(ipiv[i1+3]!=i1+3)
+					{
+					drowsw_lib(jj+8, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps);
+					drowsw_lib(n-jj-12, pD+(i1+3)/ps*ps*sdd+(i1+3)%ps+(jj+12)*ps, pD+(ipiv[i1+3])/ps*ps*sdd+(ipiv[i1+3])%ps+(jj+12)*ps);
+					}
+				}
+			}
+		}
+
+	// solve upper
+//	i0 -= 8;
+	ll = jj+12;
+	for( ; ll<n; ll+=4)
+		{
+		kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd, m-i0, n-ll);
+		}
+	return;
+#endif
+
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	left_n_8:
+	// 5-8 columns at a time
+	// pivot & factorize & solve lower
+	// left block-column
+	ii = jj;
+	i0 = ii;
+	for( ; ii<m-4; ii+=8)
+		{
+		kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+		}
+	if(m-ii>0)
+		{
+		kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+//		ii+=4;
+		}
+	kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+	ipiv[i0+0] += i0;
+	if(ipiv[i0+0]!=i0+0)
+		{
+		drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+		}
+	ipiv[i0+1] += i0;
+	if(ipiv[i0+1]!=i0+1)
+		{
+		drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+		}
+	ipiv[i0+2] += i0;
+	if(ipiv[i0+2]!=i0+2)
+		{
+		drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+		}
+	ipiv[i0+3] += i0;
+	if(ipiv[i0+3]!=i0+3)
+		{
+		drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+		}
+	// right block-column
+	ii = i0;
+	kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd], 4, n-jj-4);
+	ii += 4;
+	i0 = ii;
+	for( ; ii<m-4; ii+=8)
+		{
+		kernel_dgemm_nn_8x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], sdd, 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+		}
+	if(m-ii>0)
+		{
+		kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+		}
+	kernel_dgetrf_pivot_4_vs_lib4(m-i0, n-jj-4, &pD[(jj+4)*ps+i0*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i0]);
+	ipiv[i0+0] += i0;
+	if(ipiv[i0+0]!=i0+0)
+		{
+		drowsw_lib(jj+4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+		drowsw_lib(n-jj-8, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+8)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+8)*ps);
+		}
+	if(n-jj-4>1)
+		{
+		ipiv[i0+1] += i0;
+		if(ipiv[i0+1]!=i0+1)
+			{
+			drowsw_lib(jj+4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+			drowsw_lib(n-jj-8, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+8)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+8)*ps);
+			}
+		if(n-jj-4>2)
+			{
+			ipiv[i0+2] += i0;
+			if(ipiv[i0+2]!=i0+2)
+				{
+				drowsw_lib(jj+4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+				drowsw_lib(n-jj-8, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+8)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+8)*ps);
+				}
+			if(n-jj-4>3)
+				{
+				ipiv[i0+3] += i0;
+				if(ipiv[i0+3]!=i0+3)
+					{
+					drowsw_lib(jj+4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+					drowsw_lib(n-jj-8, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+8)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+8)*ps);
+					}
+				}
+			}
+		}
+
+	// solve upper
+	// there is no upper
+	return;
+#endif
+
+
+#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	left_m_8:
+	// 5-8 rows at a time
+	// pivot & factorize & solve lower
+	// left block-column
+	ii = jj;
+	i0 = ii;
+	kernel_dgemm_nn_8x4_gen_lib4(jj, &dm1, &pD[ii*sdd], sdd, 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, 4);
+	kernel_dgetrf_pivot_4_lib4(m-i0, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+	ipiv[i0+0] += i0;
+	if(ipiv[i0+0]!=i0+0)
+		{
+		drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+		}
+	ipiv[i0+1] += i0;
+	if(ipiv[i0+1]!=i0+1)
+		{
+		drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+		}
+	ipiv[i0+2] += i0;
+	if(ipiv[i0+2]!=i0+2)
+		{
+		drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+		}
+	ipiv[i0+3] += i0;
+	if(ipiv[i0+3]!=i0+3)
+		{
+		drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+		}
+	// right block-column
+	ii = i0;
+	kernel_dtrsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[(jj+4)*ps], sdd, &pD[(jj+4)*ps+ii*sdd], &pD[(jj+4)*ps+ii*sdd], &pD[ii*ps+ii*sdd], 4, n-jj-4);
+	ii += 4;
+	i0 = ii;
+	kernel_dgemm_nn_4x4_gen_lib4((jj+4), &dm1, &pD[ii*sdd], 0, &pD[(jj+4)*ps], sdd, &d1, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, &pD[(jj+4)*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj-4);
+	kernel_dgetrf_pivot_4_vs_lib4(m-i0, n-jj-4, &pD[(jj+4)*ps+i0*sdd], sdd, &inv_diag_D[(jj+4)], &ipiv[i0]);
+	ipiv[i0+0] += i0;
+	if(ipiv[i0+0]!=i0+0)
+		{
+		drowsw_lib(jj+4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+		drowsw_lib(n-jj-8, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+8)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+8)*ps);
+		}
+	if(m-jj-4>1)
+		{
+		ipiv[i0+1] += i0;
+		if(ipiv[i0+1]!=i0+1)
+			{
+			drowsw_lib(jj+4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+			drowsw_lib(n-jj-8, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+8)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+8)*ps);
+			}
+		if(m-jj-4>2)
+			{
+			ipiv[i0+2] += i0;
+			if(ipiv[i0+2]!=i0+2)
+				{
+				drowsw_lib(jj+4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+				drowsw_lib(n-jj-8, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+8)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+8)*ps);
+				}
+			if(m-jj-4>3)
+				{
+				ipiv[i0+3] += i0;
+				if(ipiv[i0+3]!=i0+3)
+					{
+					drowsw_lib(jj+4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+					drowsw_lib(n-jj-8, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+8)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+8)*ps);
+					}
+				}
+			}
+		}
+
+	// solve upper
+	i0 -= 4;
+	ll = jj+8;
+	for( ; ll<n; ll+=4)
+		{
+		kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i0, &pD[i0*sdd], sdd, &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[ll*ps+i0*sdd], sdd, &pD[i0*ps+i0*sdd], sdd, m-i0, n-ll);
+		}
+	return;
+#endif
+
+
+	left_n_4:
+	// 1-4 columns at a time
+	// pivot & factorize & solve lower
+	ii = jj;
+	i0 = ii;
+#if 0//defined(TARGET_X64_AVX2) || defined(TARGET_X64_AVX)
+	for( ; ii<m-4; ii+=8)
+		{
+		kernel_dgemm_nn_8x4_vs_lib4(m-ii, n-jj, jj, &pD[ii*sdd], sdd, &pD[jj*ps], sdd, -1, &pD[jj*ps+ii*sdd], sdd, &pD[jj*ps+ii*sdd], sdd, 0, 0);
+		}
+	if(m-ii>0)
+		{
+		kernel_dgemm_nn_4x4_vs_lib4(m-ii, n-jj, jj, &pD[ii*sdd], &pD[jj*ps], sdd, -1, &pD[jj*ps+ii*sdd], &pD[jj*ps+ii*sdd], 0, 0);
+//		ii+=4;
+		}
+#else
+	for( ; ii<m; ii+=4)
+		{
+		kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj);
+		}
+#endif
+	kernel_dgetrf_pivot_4_vs_lib4(m-i0, n-jj, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+	ipiv[i0+0] += i0;
+	if(ipiv[i0+0]!=i0+0)
+		{
+		drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+		}
+	if(n-jj>1)
+		{
+		ipiv[i0+1] += i0;
+		if(ipiv[i0+1]!=i0+1)
+			{
+			drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+			drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+			}
+		if(n-jj>2)
+			{
+			ipiv[i0+2] += i0;
+			if(ipiv[i0+2]!=i0+2)
+				{
+				drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+				drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+				}
+			if(n-jj>3)
+				{
+				ipiv[i0+3] += i0;
+				if(ipiv[i0+3]!=i0+3)
+					{
+					drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+					drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+					}
+				}
+			}
+		}
+
+	// solve upper
+	if(0) // there is no upper
+		{
+		ll = jj+4;
+		for( ; ll<n; ll+=4)
+			{
+			kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], &pD[ll*ps+i0*sdd], &pD[i0*ps+i0*sdd], m-i0, n-ll);
+			}
+		}
+	return;
+
+
+	left_m_4:
+	// 1-4 rows at a time
+	// pivot & factorize & solve lower
+	ii = jj;
+	i0 = ii;
+	kernel_dgemm_nn_4x4_gen_lib4(jj, &dm1, &pD[ii*sdd], 0, &pD[jj*ps], sdd, &d1, 0, &pD[jj*ps+ii*sdd], sdd, 0, &pD[jj*ps+ii*sdd], sdd, 0, m-ii, 0, n-jj);
+	kernel_dgetrf_pivot_4_vs_lib4(m-i0, n-jj, &pD[jj*ps+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+	ipiv[i0+0] += i0;
+	if(ipiv[i0+0]!=i0+0)
+		{
+		drowsw_lib(jj, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps);
+		drowsw_lib(n-jj-4, pD+(i0+0)/ps*ps*sdd+(i0+0)%ps+(jj+4)*ps, pD+(ipiv[i0+0])/ps*ps*sdd+(ipiv[i0+0])%ps+(jj+4)*ps);
+		}
+	if(m-i0>1)
+		{
+		ipiv[i0+1] += i0;
+		if(ipiv[i0+1]!=i0+1)
+			{
+			drowsw_lib(jj, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps);
+			drowsw_lib(n-jj-4, pD+(i0+1)/ps*ps*sdd+(i0+1)%ps+(jj+4)*ps, pD+(ipiv[i0+1])/ps*ps*sdd+(ipiv[i0+1])%ps+(jj+4)*ps);
+			}
+		if(m-i0>2)
+			{
+			ipiv[i0+2] += i0;
+			if(ipiv[i0+2]!=i0+2)
+				{
+				drowsw_lib(jj, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps);
+				drowsw_lib(n-jj-4, pD+(i0+2)/ps*ps*sdd+(i0+2)%ps+(jj+4)*ps, pD+(ipiv[i0+2])/ps*ps*sdd+(ipiv[i0+2])%ps+(jj+4)*ps);
+				}
+			if(m-i0>3)
+				{
+				ipiv[i0+3] += i0;
+				if(ipiv[i0+3]!=i0+3)
+					{
+					drowsw_lib(jj, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps);
+					drowsw_lib(n-jj-4, pD+(i0+3)/ps*ps*sdd+(i0+3)%ps+(jj+4)*ps, pD+(ipiv[i0+3])/ps*ps*sdd+(ipiv[i0+3])%ps+(jj+4)*ps);
+					}
+				}
+			}
+		}
+
+	// solve upper
+	ll = jj+4;
+	for( ; ll<n; ll+=4)
+		{
+		kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*ps], sdd, &pD[ll*ps+i0*sdd], &pD[ll*ps+i0*sdd], &pD[i0*ps+i0*sdd], m-i0, n-ll);
+		}
+	return;
+
+	}
+
+
+#if 0
+void dlauum_dpotrf_blk_nt_l_lib(int m, int n, int nv, int *rv, int *cv, double *pA, int sda, double *pB, int sdb, int alg, double *pC, int sdc, double *pD, int sdd, double *inv_diag_D)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	// TODO remove
+	int k = cv[nv-1];
+
+	const int ps = 4;
+
+	int i, j, l;
+	int ii, iii, jj, kii, kiii, kjj, k0, k1;
+
+	i = 0;
+	ii = 0;
+	iii = 0;
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-7; i+=8)
+		{
+
+		while(ii<nv && rv[ii]<i+8)
+			ii++;
+		if(ii<nv)
+			kii = cv[ii];
+		else
+			kii = cv[ii-1];
+
+		j = 0;
+		jj = 0;
+		for(; j<i && j<n-3; j+=4)
+			{
+
+			while(jj<nv && rv[jj]<j+4)
+				jj++;
+			if(jj<nv)
+				kjj = cv[jj];
+			else
+				kjj = cv[jj-1];
+			k0 = kii<kjj ? kii : kjj;
+
+			kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &inv_diag_D[j]);
+			}
+		if(j<n)
+			{
+
+			while(jj<nv && rv[jj]<j+4)
+				jj++;
+			if(jj<nv)
+				kjj = cv[jj];
+			else
+				kjj = cv[jj-1];
+			k0 = kii<kjj ? kii : kjj;
+
+			if(j<i) // dgemm
+				{
+				kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &inv_diag_D[j], 8, n-j);
+				}
+			else // dsyrk
+				{
+				kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &inv_diag_D[j], 8, n-j);
+				if(j<n-4)
+					{
+					kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], alg, &pC[(j+4)*ps+(j+4)*sdc], &pD[(j+4)*ps+(j+4)*sdd], &inv_diag_D[j+4], 4, n-j-4); // TODO
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else
+			{
+			goto left_8;
+			}
+		}
+#else
+	for(; i<m-3; i+=4)
+		{
+
+		while(ii<nv && rv[ii]<i+4)
+			ii++;
+		if(ii<nv)
+			kii = cv[ii];
+		else
+			kii = cv[ii-1];
+
+		j = 0;
+		jj = 0;
+		for(; j<i && j<n-3; j+=4)
+			{
+
+			while(jj<nv && rv[jj]<j+4)
+				jj++;
+			if(jj<nv)
+				kjj = cv[jj];
+			else
+				kjj = cv[jj-1];
+			k0 = kii<kjj ? kii : kjj;
+
+			kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &inv_diag_D[j]);
+			}
+		if(j<n)
+			{
+
+			while(jj<nv && rv[jj]<j+4)
+				jj++;
+			if(jj<nv)
+				kjj = cv[jj];
+			else
+				kjj = cv[jj-1];
+			k0 = kii<kjj ? kii : kjj;
+
+			if(j<i) // dgemm
+				{
+				kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &inv_diag_D[j], 4, n-j);
+				}
+			else // dsyrk
+				{
+				kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &inv_diag_D[j], 4, n-j);
+				}
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+#endif
+
+	// common return if i==m
+	return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
+	left_8:
+
+	kii = cv[nv-1];
+
+	j = 0;
+	jj = 0;
+	for(; j<i && j<n-3; j+=4)
+		{
+
+		while(jj<nv && rv[jj]<j+4)
+			jj++;
+		if(jj<nv)
+			kjj = cv[jj];
+		else
+			kjj = cv[jj-1];
+		k0 = kii<kjj ? kii : kjj;
+
+		kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
+		}
+	if(j<n)
+		{
+
+		while(jj<nv && rv[jj]<j+4)
+			jj++;
+		if(jj<nv)
+			kjj = cv[jj];
+		else
+			kjj = cv[jj-1];
+		k0 = kii<kjj ? kii : kjj;
+
+		if(j<i) // dgemm
+			{
+			kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
+			}
+		else // dsyrk
+			{
+			kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k0, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], alg, &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &inv_diag_D[j], m-i, n-j);
+			if(j<n-4)
+				{
+				kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], alg, &pC[(j+4)*ps+(j+4)*sdc], &pD[(j+4)*ps+(j+4)*sdd], &inv_diag_D[j+4], m-i-4, n-j-4); // TODO
+				}
+			}
+		}
+	return;
+#endif
+
+	left_4:
+
+	kii = cv[nv-1];
+
+	j = 0;
+	jj = 0;
+	for(; j<i && j<n-3; j+=4)
+		{
+
+		while(jj<nv && rv[jj]<j+4)
+			jj++;
+		if(jj<nv)
+			kjj = cv[jj];
+		else
+			kjj = cv[jj-1];
+		k0 = kii<kjj ? kii : kjj;
+
+		kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
+		}
+	if(j<n)
+		{
+
+		while(jj<nv && rv[jj]<j+4)
+			jj++;
+		if(jj<nv)
+			kjj = cv[jj];
+		else
+			kjj = cv[jj-1];
+		k0 = kii<kjj ? kii : kjj;
+
+		if(j<i) // dgemm
+			{
+			kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
+			}
+		else // dsyrk
+			{
+			kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k0, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], alg, &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &inv_diag_D[j], m-i, n-j);
+			}
+		}
+	return;
+
+	}
+#endif
+
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// dpotrf
+void dpotrf_l_libstr(int m, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0)
+		return;
+
+	if(ci!=0 | di!=0)
+		{
+		printf("\ndpotrf_l_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+		exit(1);
+		}
+
+	const int ps = 4;
+
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	double *pC = sC->pA + cj*ps;
+	double *pD = sD->pA + dj*ps;
+	double *dD = sD->dA;
+
+	if(di==0 & dj==0) // XXX what to do if di and dj are not zero
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+
+	int i, j, l;
+
+	i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-11; i+=12)
+		{
+		j = 0;
+		for(; j<i; j+=4)
+			{
+			kernel_dtrsm_nt_rl_inv_12x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+			}
+		kernel_dpotrf_nt_l_12x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+		kernel_dpotrf_nt_l_8x8_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4]);
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else
+			{
+			goto left_12;
+			}
+		}
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	for(; i<m-7; i+=8)
+		{
+		j = 0;
+		for(; j<i; j+=4)
+			{
+			kernel_dtrsm_nt_rl_inv_8x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+			}
+		kernel_dpotrf_nt_l_8x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+		kernel_dpotrf_nt_l_4x4_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4]);
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else
+			{
+			goto left_8;
+			}
+		}
+#else
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<i; j+=4)
+			{
+			kernel_dtrsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j]);
+			}
+		kernel_dpotrf_nt_l_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j]);
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+#endif
+
+	// common return if i==m
+	return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_12: // 9 - 12
+	j = 0;
+	for(; j<i; j+=4)
+		{
+		kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+		}
+	kernel_dpotrf_nt_l_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+	kernel_dpotrf_nt_l_8x8_vs_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, m-j-4);
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_8:
+	j = 0;
+	for(; j<i-8; j+=12)
+		{
+		kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+		kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4((j+4), &pD[i*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, &pD[(j+4)*ps+(j+4)*sdd], sdd, &dD[(j+4)], m-i, m-(j+4));
+		}
+	if(j<i-4)
+		{
+		kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+		kernel_dtrsm_nt_rl_inv_4x4_vs_lib4((j+4), &pD[i*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], &pD[(j+4)*ps+(j+4)*sdd], &dD[(j+4)], m-i, m-(j+4));
+		j += 8;
+		}
+	else if(j<i)
+		{
+		kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+		j += 4;
+		}
+	kernel_dpotrf_nt_l_8x8_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+	return;
+#endif
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	left_8:
+	j = 0;
+	for(; j<i; j+=4)
+		{
+		kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+		}
+	kernel_dpotrf_nt_l_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+	kernel_dpotrf_nt_l_4x4_vs_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, m-j-4);
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_4:
+	j = 0;
+	for(; j<i-8; j+=12)
+		{
+		kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+		}
+	if(j<i-4)
+		{
+		kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, m-j);
+		j += 8;
+		}
+	else if(j<i)
+		{
+		kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+		j += 4;
+		}
+	kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+	return;
+#else
+	left_4:
+	j = 0;
+	for(; j<i; j+=4)
+		{
+		kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+		}
+	kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, m-j);
+	return;
+#endif
+
+	}
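+
+
+
+// Usage sketch for dpotrf_l_libstr (illustrative only, kept compile-time
+// disabled): sD gets the lower Cholesky factor of the symmetric positive
+// definite matrix in sC, i.e. D*D^T = C. The example_ wrapper is a
+// hypothetical caller; d_allocate_strmat, d_cvt_mat2strmat and d_free_strmat
+// are assumed to come from the auxiliary interface.
+#if 0
+void example_dpotrf_l(int n, double *A, int lda)
+	{
+	struct d_strmat sC, sD;
+	d_allocate_strmat(n, n, &sC);
+	d_allocate_strmat(n, n, &sD);
+	// pack the column-major input into the panel-major d_strmat layout (assumed aux helper)
+	d_cvt_mat2strmat(n, n, A, lda, &sC, 0, 0);
+	// lower factor overwrites sD; the row offsets ci and di must be 0 in this version
+	dpotrf_l_libstr(n, &sC, 0, 0, &sD, 0, 0);
+	d_free_strmat(&sC);
+	d_free_strmat(&sD);
+	}
+#endif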
+
+
+
+// dpotrf, m x n variant
+void dpotrf_l_mn_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	if(ci!=0 | di!=0)
+		{
+		printf("\ndpotrf_l_mn_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+		exit(1);
+		}
+
+	const int ps = 4;
+
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	double *pC = sC->pA + cj*ps;
+	double *pD = sD->pA + dj*ps;
+	double *dD = sD->dA;
+
+	if(di==0 & dj==0) // XXX what to do if di and dj are not zero
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+
+	int i, j, l;
+
+	i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-11; i+=12)
+		{
+		j = 0;
+		for(; j<i & j<n-3; j+=4)
+			{
+			kernel_dtrsm_nt_rl_inv_12x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+			}
+		if(j<n)
+			{
+			if(j<i) // dtrsm
+				{
+				kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+				}
+			else // dpotrf
+				{
+				if(j<n-11)
+					{
+					kernel_dpotrf_nt_l_12x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+					kernel_dpotrf_nt_l_8x8_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4]);
+					}
+				else
+					{
+					kernel_dpotrf_nt_l_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+					if(j<n-4)
+						{
+						kernel_dpotrf_nt_l_8x4_vs_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
+						if(j<n-8)
+							{
+							kernel_dpotrf_nt_l_4x4_vs_lib4(j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], &dD[j+8], m-i-8, n-j-8);
+							}
+						}
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else
+			{
+			goto left_12;
+			}
+		}
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	for(; i<m-7; i+=8)
+		{
+		j = 0;
+		for(; j<i & j<n-3; j+=4)
+			{
+			kernel_dtrsm_nt_rl_inv_8x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+			}
+		if(j<n)
+			{
+			if(j<i) // dtrsm
+				{
+				kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+				}
+			else // dpotrf
+				{
+				if(j<n-7)
+//				if(0)
+					{
+					kernel_dpotrf_nt_l_8x4_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+					kernel_dpotrf_nt_l_4x4_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4]);
+					}
+				else
+					{
+					kernel_dpotrf_nt_l_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+					if(j<n-4)
+						{
+						kernel_dpotrf_nt_l_4x4_vs_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+						}
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else
+			{
+			goto left_8;
+			}
+		}
+#else
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<i & j<n-3; j+=4)
+			{
+			kernel_dtrsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j]);
+			}
+		if(j<n)
+			{
+			if(j<i) // dtrsm
+				{
+				kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+				}
+			else // dpotrf
+				{
+				if(j<n-3)
+					{
+					kernel_dpotrf_nt_l_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j]);
+					}
+				else
+					{
+					kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+#endif
+
+	// common return if i==m
+	return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_12:
+	j = 0;
+	for(; j<i & j<n; j+=4)
+		{
+		kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+		}
+	if(j<n)
+		{
+		kernel_dpotrf_nt_l_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+		if(j<n-4)
+			{
+			kernel_dpotrf_nt_l_8x4_vs_lib4(j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
+			if(j<n-8)
+				{
+				kernel_dpotrf_nt_l_4x4_vs_lib4(j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], &dD[j+8], m-i-8, n-j-8);
+				}
+			}
+		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_8:
+	j = 0;
+	for(; j<i-8 & j<n-8; j+=12)
+		{
+		kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+		kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4((j+4), &pD[i*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, &pD[(j+4)*ps+(j+4)*sdd], sdd, &dD[(j+4)], m-i, n-(j+4));
+		}
+	if(j<i-4 & j<n-4)
+		{
+		kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+		kernel_dtrsm_nt_rl_inv_4x4_vs_lib4((j+4), &pD[i*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], &pD[(j+4)*ps+(j+4)*sdd], &dD[(j+4)], m-i, n-(j+4));
+		j += 8;
+		}
+	else if(j<i & j<n)
+		{
+		kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+		j += 4;
+		}
+	if(j<n)
+		{
+		kernel_dpotrf_nt_l_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+		if(j<n-4)
+			{
+			kernel_dpotrf_nt_l_4x4_vs_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+			}
+		}
+	return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	left_8:
+	j = 0;
+	for(; j<i & j<n; j+=4)
+		{
+		kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+		}
+	if(j<n)
+		{
+		kernel_dpotrf_nt_l_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+		if(j<n-4)
+			{
+			kernel_dpotrf_nt_l_4x4_vs_lib4(j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+			}
+		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_4:
+	j = 0;
+	for(; j<i-8 & j<n-8; j+=12)
+		{
+		kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+		}
+	if(j<i-4 & j<n-4)
+		{
+		kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+		j += 8;
+		}
+	else if(j<i & j<n)
+		{
+		kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+		j += 4;
+		}
+	if(j<n)
+		{
+		kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+		}
+	return;
+#else
+	left_4:
+	j = 0;
+	for(; j<i & j<n; j+=4)
+		{
+		kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+		}
+	if(j<n)
+		{
+		kernel_dpotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+		}
+	return;
+#endif
+
+	}
+
+
+
+// dsyrk dpotrf
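+// computes D as the lower Cholesky factor of C + A * B^T (pass B = A for a plain syrk update)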
+void dsyrk_dpotrf_ln_libstr(int m, int n, int k, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+		{
+		printf("\ndsyrk_dpotrf_ln_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+		exit(1);
+		}
+
+	const int ps = 4;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	double *pA = sA->pA + aj*ps;
+	double *pB = sB->pA + bj*ps;
+	double *pC = sC->pA + cj*ps;
+	double *pD = sD->pA + dj*ps;
+	double *dD = sD->dA; // XXX what to do if di and dj are not zero
+
+	if(di==0 & dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+
+	int i, j, l;
+
+	i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-11; i+=12)
+		{
+		j = 0;
+		for(; j<i & j<n-3; j+=4)
+			{
+			kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+			}
+		if(j<n)
+			{
+			if(j<i) // dgemm
+				{
+				kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+				}
+			else // dsyrk
+				{
+				if(j<n-11)
+					{
+					kernel_dsyrk_dpotrf_nt_l_12x4_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+					kernel_dsyrk_dpotrf_nt_l_8x8_lib4(k, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4]);
+					}
+				else
+					{
+					kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+					if(j<n-4)
+						{
+						if(j<n-8)
+							{
+							kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4(k, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
+							}
+						else
+							{
+							kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
+							}
+						}
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else
+			{
+			goto left_12;
+			}
+		}
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	for(; i<m-7; i+=8)
+		{
+		j = 0;
+		for(; j<i & j<n-3; j+=4)
+			{
+			kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j]);
+			}
+		if(j<n)
+			{
+			if(j<i) // dgemm
+				{
+				kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+				}
+			else // dsyrk
+				{
+				if(j<n-7)
+//				if(0)
+					{
+					kernel_dsyrk_dpotrf_nt_l_8x4_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j]);
+					kernel_dsyrk_dpotrf_nt_l_4x4_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4]);
+					}
+				else
+					{
+					kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+					if(j<n-4)
+						{
+						kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+						}
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else
+			{
+			goto left_8;
+			}
+		}
+#else
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<i & j<n-3; j+=4)
+			{
+			kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j]);
+			}
+		if(j<n)
+			{
+			if(j<i) // dgemm
+				{
+				kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+				}
+			else // dsyrk
+				{
+				if(j<n-3)
+					{
+					kernel_dsyrk_dpotrf_nt_l_4x4_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j]);
+					}
+				else
+					{
+					kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+#endif
+
+	// common return if i==m
+	return;
+
+	// clean-up loop definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_12:
+	j = 0;
+	for(; j<i & j<n; j+=4)
+		{
+		kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+		}
+	if(j<n)
+		{
+		kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+		if(j<n-4)
+			{
+			kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], sdd, &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, &dD[j+4], m-i-4, n-j-4);
+			if(j<n-8)
+				{
+				kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], &dD[j+8], m-i-8, n-j-8);
+				}
+			}
+		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_8:
+	j = 0;
+	for(; j<i-8 & j<n-8; j+=12)
+		{
+		kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], sdb, j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+		kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4(k, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, (j+4), &pD[i*sdd], sdd, &pD[(j+4)*sdd], sdd, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, &pD[(j+4)*ps+(j+4)*sdd], sdd, &dD[(j+4)], m-i, n-(j+4));
+		}
+	if(j<i-3 & j<n-3)
+		{
+		kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], sdb, j, &pD[i*sdd], sdd, &pD[j*sdd], sdd, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+		kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[(j+4)*sdb], (j+4), &pD[i*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], &pD[(j+4)*ps+(j+4)*sdd], &dD[(j+4)], m-i, n-(j+4));
+		j += 8;
+		}
+	else if(j<i & j<n)
+		{
+		kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+		j += 4;
+		}
+	if(j<n)
+		{
+		kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+		if(j<n-4)
+			{
+			kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+			}
+		}
+	return;
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	left_8:
+	j = 0;
+	for(; j<i & j<n; j+=4)
+		{
+		kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+		}
+	if(j<n)
+		{
+		kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(k, &pA[i*sda], sda, &pB[j*sdb], j, &pD[i*sdd], sdd, &pD[j*sdd], &pC[j*ps+j*sdc], sdc, &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+		if(j<n-4)
+			{
+			kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[(i+4)*sda], &pB[(j+4)*sdb], j+4, &pD[(i+4)*sdd], &pD[(j+4)*sdd], &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], &dD[j+4], m-i-4, n-j-4);
+			}
+		}
+	return;
+#endif
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_4:
+	j = 0;
+	for(; j<i-8 & j<n-8; j+=12)
+		{
+		kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4(k, &pA[i*sda], &pB[j*sdb], sdb, j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+		}
+	if(j<i-4 & j<n-4)
+		{
+		kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4(k, &pA[i*sda], &pB[j*sdb], sdb, j, &pD[i*sdd], &pD[j*sdd], sdd, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], sdd, &dD[j], m-i, n-j);
+		j += 8;
+		}
+	else if(j<i & j<n)
+		{
+		kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+		j += 4;
+		}
+	if(j<n)
+		{
+		kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+		}
+#else
+	left_4:
+	j = 0;
+	for(; j<i & j<n; j+=4)
+		{
+		kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+		}
+	if(j<n)
+		{
+		kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*ps+j*sdc], &pD[j*ps+j*sdd], &dD[j], m-i, n-j);
+		}
+#endif
+
+	return;
+
+	}
+
+
+
+// dgetrf without pivoting
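+// D = L*U of C with no row pivoting; the reciprocals of the diagonal of the factor are stored in sD->dA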
+void dgetrf_nopivot_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj)
+	{
+	if(ci!=0 | di!=0)
+		{
+		printf("\ndgetrf_nopivot_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+		exit(1);
+		}
+	const int ps = 4;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	double *pC = sC->pA + cj*ps;
+	double *pD = sD->pA + dj*ps;
+	double *dD = sD->dA; // XXX what to do if di and dj are not zero
+	dgetrf_nn_nopivot_lib(m, n, pC, sdc, pD, sdd, dD);
+	if(di==0 && dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+	return;
+	}
+
+
+
+
+// dgetrf with row pivoting
+void dgetrf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, int *ipiv)
+	{
+	if(ci!=0 | di!=0)
+		{
+		printf("\ndgetrf_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+		exit(1);
+		}
+	const int ps = 4;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	double *pC = sC->pA + cj*ps;
+	double *pD = sD->pA + dj*ps;
+	double *dD = sD->dA; // XXX what to do if di and dj are not zero
+	dgetrf_nn_lib(m, n, pC, sdc, pD, sdd, dD, ipiv);
+	if(di==0 && dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+	return;
+	}
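+
+
+
+// Usage sketch for dgetrf_libstr (illustrative only, guarded out): LU
+// factorization with partial row pivoting; the packed L and U factors
+// overwrite sD, and ipiv (min(m,n) entries) receives, as produced by the
+// kernels above, the 0-based index of the row swapped with each row. The
+// example_ wrapper is a hypothetical caller.
+#if 0
+void example_dgetrf(int m, int n, struct d_strmat *sC, struct d_strmat *sD)
+	{
+	int *ipiv = malloc((m<n ? m : n)*sizeof(int));
+	dgetrf_libstr(m, n, sC, 0, 0, sD, 0, 0, ipiv);
+	free(ipiv);
+	}
+#endif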
+
+
+
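+// workspace size in bytes for dgeqrf_libstr below: one ps x cm panel for the
+// transposed Householder vectors and one ps x cn panel of scratch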
+int dgeqrf_work_size_libstr(int m, int n)
+	{
+	const int ps = 4;
+	int cm = (m+ps-1)/ps*ps;
+	int cn = (n+ps-1)/ps*ps;
+	return ps*(cm+cn)*sizeof(double);
+//	return 0;
+	}
+
+
+
+void dgeqrf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, void *v_work)
+	{
+	char *work = (char *) v_work;
+	if(m<=0 | n<=0)
+		return;
+	const int ps = 4;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	double *pC = &(DMATEL_LIBSTR(sC,ci,cj));
+	double *pD = &(DMATEL_LIBSTR(sD,di,dj));
+	double *dD = sD->dA + di;
+	int cm = (m+ps-1)/ps*ps;
+	int cn = (n+ps-1)/ps*ps;
+	double *pVt = (double *) work;
+	work += ps*cm*sizeof(double);
+	double *pW = (double *) work;
+	work += ps*cn*sizeof(double);
+	if(pC!=pD)
+		dgecp_lib(m, n, 1.0, ci&(ps-1), pC, sdc, di&(ps-1), pD, sdd);
+	int ii;
+	int imax0 = (ps-(di&(ps-1)))&(ps-1);
+	int imax = m<n ? m : n;
+	imax0 = imax<imax0 ? imax : imax0;
+	if(imax0>0)
+		{
+		kernel_dgeqrf_vs_lib4(m, n, imax0, di&(ps-1), pD, sdd, dD);
+		pD += imax0-ps+ps*sdd+imax0*ps;
+		dD += imax0;
+		m -= imax0;
+		n -= imax0;
+		imax -= imax0;
+		}
+	for(ii=0; ii<imax-3; ii+=4)
+		{
+		kernel_dgeqrf_4_lib4(m-ii, pD+ii*sdd+ii*ps, sdd, dD+ii);
+#if 0
+		kernel_dlarf_4_lib4(m-ii, n-ii-4, pD+ii*sdd+ii*ps, sdd, dD+ii, pD+ii*sdd+(ii+4)*ps, sdd);
+#else
+		kernel_dgetr_4_0_lib4(m-ii, pD+ii*sdd+ii*ps, sdd, pVt);
+		pVt[0+ps*0] = 1.0;
+		pVt[1+ps*0] = 0.0;
+		pVt[2+ps*0] = 0.0;
+		pVt[3+ps*0] = 0.0;
+		pVt[1+ps*1] = 1.0;
+		pVt[2+ps*1] = 0.0;
+		pVt[3+ps*1] = 0.0;
+		pVt[2+ps*2] = 1.0;
+		pVt[3+ps*2] = 0.0;
+		pVt[3+ps*3] = 1.0;
+		kernel_dlarf_t_4_lib4(m-ii, n-ii-4, pD+ii*sdd+ii*ps, sdd, pVt, dD+ii, pD+ii*sdd+(ii+4)*ps, sdd, pW);
+#endif
+		}
+	if(ii<imax)
+		{
+		kernel_dgeqrf_vs_lib4(m-ii, n-ii, imax-ii, ii&(ps-1), pD+ii*sdd+ii*ps, sdd, dD+ii);
+		}
+	return;
+	}
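+
+
+
+// Usage sketch for dgeqrf_libstr (illustrative only, guarded out): QR
+// factorization of sC into sD (R in the upper triangle, Householder vectors
+// below it, per-column reflector scalars in sD->dA), with the workspace sized
+// by dgeqrf_work_size_libstr above. The example_ wrapper is a hypothetical
+// caller.
+#if 0
+void example_dgeqrf(int m, int n, struct d_strmat *sC, struct d_strmat *sD)
+	{
+	void *work = malloc(dgeqrf_work_size_libstr(m, n));
+	dgeqrf_libstr(m, n, sC, 0, 0, sD, 0, 0, work);
+	free(work);
+	}
+#endif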
+
+
+
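+// no external workspace is needed: dgelqf_libstr below uses fixed-size local scratch (pT, pK)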
+int dgelqf_work_size_libstr(int m, int n)
+	{
+	return 0;
+	}
+
+
+
+void dgelqf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, void *work)
+	{
+	if(m<=0 | n<=0)
+		return;
+	const int ps = 4;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	double *pC = &(DMATEL_LIBSTR(sC,ci,cj));
+	double *pD = &(DMATEL_LIBSTR(sD,di,dj));
+	double *dD = sD->dA + di;
+#if defined(TARGET_X64_INTEL_HASWELL)
+	double pT[144] __attribute__ ((aligned (64))) = {0};
+	double pK[96] __attribute__ ((aligned (64))) = {0};
+#else
+	double pT[144] = {0};
+	double pK[96] = {0};
+#endif
+	if(pC!=pD)
+		dgecp_lib(m, n, 1.0, ci&(ps-1), pC, sdc, di&(ps-1), pD, sdd);
+	int ii, jj, ll;
+	int imax0 = (ps-(di&(ps-1)))&(ps-1);
+	int imax = m<n ? m : n;
+#if 0
+	kernel_dgelqf_vs_lib4(m, n, imax, di&(ps-1), pD, sdd, dD);
+#else
+	imax0 = imax<imax0 ? imax : imax0;
+	if(imax0>0)
+		{
+		kernel_dgelqf_vs_lib4(m, n, imax0, di&(ps-1), pD, sdd, dD);
+		pD += imax0-ps+ps*sdd+imax0*ps;
+		dD += imax0;
+		m -= imax0;
+		n -= imax0;
+		imax -= imax0;
+		}
+	ii = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+//	for(; ii<imax-11; ii+=12)
+	for(; ii<imax-127; ii+=12) // crossover point ~ ii=128
+		{
+		kernel_dgelqf_dlarft12_12_lib4(n-(ii+0), pD+(ii+0)*sdd+(ii+0)*ps, sdd, dD+(ii+0), &pT[0+0*12+0*ps]);
+		jj = ii+12;
+		for(; jj<m; jj+=4)
+			{
+			kernel_dlarfb12_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, sdd, pT, pD+jj*sdd+ii*ps, pK, m-jj);
+			}
+		}
+	for(; ii<imax-11; ii+=4)
+		{
+		kernel_dgelqf_dlarft4_12_lib4(n-ii, pD+ii*sdd+ii*ps, sdd, dD+ii, pT);
+		jj = ii+12;
+		for(; jj<m-11; jj+=12)
+			{
+			kernel_dlarfb4_r_12_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+			}
+		for(; jj<m-7; jj+=8)
+			{
+			kernel_dlarfb4_r_8_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+			}
+		for(; jj<m-3; jj+=4)
+			{
+			kernel_dlarfb4_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps);
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			kernel_dlarfb4_r_1_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+ll+jj*sdd+ii*ps);
+			}
+		}
+	// 8 9 10 11
+	if(ii<imax-7)
+		{
+		kernel_dgelqf_dlarft4_8_lib4(n-ii, pD+ii*sdd+ii*ps, sdd, dD+ii, pT);
+		jj = ii+8;
+		if(jj<m)
+			{
+			for(; jj<m-11; jj+=12)
+				{
+				kernel_dlarfb4_r_12_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+				}
+			for(; jj<m-7; jj+=8)
+				{
+				kernel_dlarfb4_r_8_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+				}
+			for(; jj<m-3; jj+=4)
+				{
+				kernel_dlarfb4_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps);
+				}
+			for(ll=0; ll<m-jj; ll++)
+				{
+				kernel_dlarfb4_r_1_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+ll+jj*sdd+ii*ps);
+				}
+			}
+		ii += 4;
+		}
+	// 4 5 6 7
+	if(ii<imax-3)
+		{
+		kernel_dgelqf_dlarft4_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii, pT);
+		jj = ii+4;
+		if(jj<m)
+			{
+			for(; jj<m-11; jj+=12)
+				{
+				kernel_dlarfb4_r_12_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+				}
+			for(; jj<m-7; jj+=8)
+				{
+				kernel_dlarfb4_r_8_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+				}
+			for(; jj<m-3; jj+=4)
+				{
+				kernel_dlarfb4_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps);
+				}
+			for(ll=0; ll<m-jj; ll++)
+				{
+				kernel_dlarfb4_r_1_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+ll+jj*sdd+ii*ps);
+				}
+			}
+		ii += 4;
+		}
+	// 1 2 3
+	if(ii<imax)
+		{
+		kernel_dgelqf_vs_lib4(m-ii, n-ii, imax-ii, ii&(ps-1), pD+ii*sdd+ii*ps, sdd, dD+ii);
+		}
+#else // no haswell
+	for(ii=0; ii<imax-4; ii+=4)
+		{
+//		kernel_dgelqf_vs_lib4(4, n-ii, 4, 0, pD+ii*sdd+ii*ps, sdd, dD+ii);
+//		kernel_dgelqf_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii);
+//		kernel_dlarft_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii, pT);
+		kernel_dgelqf_dlarft4_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii, pT);
+		jj = ii+4;
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+		for(; jj<m-7; jj+=8)
+			{
+			kernel_dlarfb4_r_8_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps, sdd);
+			}
+#endif
+		for(; jj<m-3; jj+=4)
+			{
+			kernel_dlarfb4_r_4_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+jj*sdd+ii*ps);
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			kernel_dlarfb4_r_1_lib4(n-ii, pD+ii*sdd+ii*ps, pT, pD+ll+jj*sdd+ii*ps);
+			}
+		}
+	if(ii<imax)
+		{
+		if(ii==imax-4)
+			{
+			kernel_dgelqf_4_lib4(n-ii, pD+ii*sdd+ii*ps, dD+ii);
+			}
+		else
+			{
+			kernel_dgelqf_vs_lib4(m-ii, n-ii, imax-ii, ii&(ps-1), pD+ii*sdd+ii*ps, sdd, dD+ii);
+			}
+		}
+#endif // no haswell
+#endif
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/blas/s_blas.h b/blas/s_blas.h
new file mode 100644
index 0000000..b6a92a7
--- /dev/null
+++ b/blas/s_blas.h
@@ -0,0 +1,66 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// headers to reference BLAS and LAPACK routines employed in BLASFEO WR
+
+// level 1
+void scopy_(int *m, float *x, int *incx, float *y, int *incy);
+void saxpy_(int *m, float *alpha, float *x, int *incx, float *y, int *incy);
+void sscal_(int *m, float *alpha, float *x, int *incx);
+
+// level 2
+void sgemv_(char *ta, int *m, int *n, float *alpha, float *A, int *lda, float *x, int *incx, float *beta, float *y, int *incy);
+void ssymv_(char *uplo, int *m, float *alpha, float *A, int *lda, float *x, int *incx, float *beta, float *y, int *incy);
+void strmv_(char *uplo, char *trans, char *diag, int *n, float *A, int *lda, float *x, int *incx);
+void strsv_(char *uplo, char *trans, char *diag, int *n, float *A, int *lda, float *x, int *incx);
+void sger_(int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *A, int *lda);
+
+// level 3
+void sgemm_(char *ta, char *tb, int *m, int *n, int *k, float *alpha, float *A, int *lda, float *B, int *ldb, float *beta, float *C, int *ldc);
+void ssyrk_(char *uplo, char *trans, int *n, int *k, float *alpha, float *A, int *lda, float *beta, float *C, int *ldc);
+void strmm_(char *side, char *uplo, char *transa, char *diag, int *m, int *n, float *alpha, float *A, int *lda, float *B, int *ldb);
+void strsm_(char *side, char *uplo, char *transa, char *diag, int *m, int *n, float *alpha, float *A, int *lda, float *B, int *ldb);
+
+// lapack
+int spotrf_(char *uplo, int *m, float *A, int *lda, int *info);
+int sgetrf_(int *m, int *n, float *A, int *lda, int *ipiv, int *info);
+void sgeqrf_(int *m, int *n, float *A, int *lda, float *tau, float *work, int *lwork, int *info);
+void sgeqr2_(int *m, int *n, float *A, int *lda, float *tau, float *work, int *info);
+void sgelqf_(int *m, int *n, float *A, int *lda, float *tau, float *work, int *lwork, int *info);
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/blas/s_blas1_lib.c b/blas/s_blas1_lib.c
new file mode 100644
index 0000000..67fec77
--- /dev/null
+++ b/blas/s_blas1_lib.c
@@ -0,0 +1,54 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#include "s_blas.h"
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+#define REAL float
+
+#define STRVEC s_strvec
+
+#define AXPY_LIBSTR saxpy_libstr
+#define VECMULDOT_LIBSTR svecmuldot_libstr
+#define DOT_LIBSTR sdot_libstr
+
+#define AXPY saxpy_
+#define COPY scopy_
+
+
+#include "x_blas1_lib.c"
+
diff --git a/blas/s_blas1_lib4.c b/blas/s_blas1_lib4.c
new file mode 100644
index 0000000..8588020
--- /dev/null
+++ b/blas/s_blas1_lib4.c
@@ -0,0 +1,123 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// z = y + alpha*x, with increments equal to 1
+void saxpy_libstr(int m, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	float *z = sz->pa + zi;
+	int ii;
+	ii = 0;
+	for( ; ii<m-3; ii+=4)
+		{
+		z[ii+0] = y[ii+0] + alpha*x[ii+0];
+		z[ii+1] = y[ii+1] + alpha*x[ii+1];
+		z[ii+2] = y[ii+2] + alpha*x[ii+2];
+		z[ii+3] = y[ii+3] + alpha*x[ii+3];
+		}
+	for( ; ii<m; ii++)
+		{
+		z[ii+0] = y[ii+0] + alpha*x[ii+0];
+		}
+	return;
+	}
+
+
+
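+// z = y (backup of y), then y = y + alpha*x, with increments equal to 1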
+void saxpy_bkp_libstr(int m, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	float *z = sz->pa + zi;
+	int ii;
+	ii = 0;
+	for( ; ii<m-3; ii+=4)
+		{
+		z[ii+0] = y[ii+0];
+		y[ii+0] = y[ii+0] + alpha*x[ii+0];
+		z[ii+1] = y[ii+1];
+		y[ii+1] = y[ii+1] + alpha*x[ii+1];
+		z[ii+2] = y[ii+2];
+		y[ii+2] = y[ii+2] + alpha*x[ii+2];
+		z[ii+3] = y[ii+3];
+		y[ii+3] = y[ii+3] + alpha*x[ii+3];
+		}
+	for( ; ii<m; ii++)
+		{
+		z[ii+0] = y[ii+0];
+		y[ii+0] = y[ii+0] + alpha*x[ii+0];
+		}
+	return;
+	}
+
+
+
+// multiply two vectors and compute dot product
+float svecmuldot_libstr(int m, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+	{
+
+	if(m<=0)
+		return 0.0;
+
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	float *z = sz->pa + zi;
+	int ii;
+	float dot = 0.0;
+
+	ii = 0;
+
+	for(; ii<m; ii++)
+		{
+		z[ii+0] = x[ii+0] * y[ii+0];
+		dot += z[ii+0];
+		}
+	return dot;
+	}
+
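+// Worked example (illustrative only): with m = 3, x = {1,2,3} and y = {4,5,6},
+// the routine above stores z = {4,10,18} and returns the dot product
+// 4 + 10 + 18 = 32.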
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
diff --git a/blas/s_blas1_lib8.c b/blas/s_blas1_lib8.c
new file mode 100644
index 0000000..538c012
--- /dev/null
+++ b/blas/s_blas1_lib8.c
@@ -0,0 +1,124 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// z = y + alpha*x, with increments equal to 1
+void saxpy_libstr(int m, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	float *z = sz->pa + zi;
+	int ii;
+	ii = 0;
+	for( ; ii<m-3; ii+=4)
+		{
+		z[ii+0] = y[ii+0] + alpha*x[ii+0];
+		z[ii+1] = y[ii+1] + alpha*x[ii+1];
+		z[ii+2] = y[ii+2] + alpha*x[ii+2];
+		z[ii+3] = y[ii+3] + alpha*x[ii+3];
+		}
+	for( ; ii<m; ii++)
+		{
+		z[ii+0] = y[ii+0] + alpha*x[ii+0];
+		}
+	return;
+	}
+
+
+
+void saxpy_bkp_libstr(int m, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+	{
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	float *z = sz->pa + zi;
+	int ii;
+	ii = 0;
+	for( ; ii<m-3; ii+=4)
+		{
+		z[ii+0] = y[ii+0];
+		y[ii+0] = y[ii+0] + alpha*x[ii+0];
+		z[ii+1] = y[ii+1];
+		y[ii+1] = y[ii+1] + alpha*x[ii+1];
+		z[ii+2] = y[ii+2];
+		y[ii+2] = y[ii+2] + alpha*x[ii+2];
+		z[ii+3] = y[ii+3];
+		y[ii+3] = y[ii+3] + alpha*x[ii+3];
+		}
+	for( ; ii<m; ii++)
+		{
+		z[ii+0] = y[ii+0];
+		y[ii+0] = y[ii+0] + alpha*x[ii+0];
+		}
+	return;
+	}
+
+
+
+// multiply two vectors and compute dot product
+float svecmuldot_libstr(int m, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+	{
+
+	if(m<=0)
+		return 0.0;
+
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	float *z = sz->pa + zi;
+	int ii;
+	float dot = 0.0;
+
+	ii = 0;
+
+	for(; ii<m; ii++)
+		{
+		z[ii+0] = x[ii+0] * y[ii+0];
+		dot += z[ii+0];
+		}
+	return dot;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/blas/s_blas2_diag_lib.c b/blas/s_blas2_diag_lib.c
new file mode 100644
index 0000000..1dde42f
--- /dev/null
+++ b/blas/s_blas2_diag_lib.c
@@ -0,0 +1,46 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_kernel.h"
+
+
+
+#define REAL float
+
+#define STRVEC s_strvec
+
+#define GEMV_DIAG_LIBSTR sgemv_diag_libstr
+
+
+
+#include "x_blas2_diag_lib.c"
+
diff --git a/blas/s_blas2_lib.c b/blas/s_blas2_lib.c
new file mode 100644
index 0000000..7ab8dc2
--- /dev/null
+++ b/blas/s_blas2_lib.c
@@ -0,0 +1,72 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#include "s_blas.h"
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+#define REAL float
+
+#define STRMAT s_strmat
+#define STRVEC s_strvec
+
+#define GEMV_N_LIBSTR sgemv_n_libstr
+#define GEMV_NT_LIBSTR sgemv_nt_libstr
+#define GEMV_T_LIBSTR sgemv_t_libstr
+#define SYMV_L_LIBSTR ssymv_l_libstr
+#define TRMV_LNN_LIBSTR strmv_lnn_libstr
+#define TRMV_LTN_LIBSTR strmv_ltn_libstr
+#define TRMV_UNN_LIBSTR strmv_unn_libstr
+#define TRMV_UTN_LIBSTR strmv_utn_libstr
+#define TRSV_LNN_LIBSTR strsv_lnn_libstr
+#define TRSV_LNN_MN_LIBSTR strsv_lnn_mn_libstr
+#define TRSV_LNU_LIBSTR strsv_lnu_libstr
+#define TRSV_LTN_LIBSTR strsv_ltn_libstr
+#define TRSV_LTN_MN_LIBSTR strsv_ltn_mn_libstr
+#define TRSV_LTU_LIBSTR strsv_ltu_libstr
+#define TRSV_UNN_LIBSTR strsv_unn_libstr
+#define TRSV_UTN_LIBSTR strsv_utn_libstr
+
+#define COPY scopy_
+#define GEMV sgemv_
+#define SYMV ssymv_
+#define TRMV strmv_
+#define TRSV strsv_
+
+
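+// x_blas2_lib.c below is a precision-agnostic template: the macros above
+// instantiate it for single precision, mapping the generic names to the
+// s_ variants (e.g. GEMV_N_LIBSTR expands to sgemv_n_libstr) and, in the
+// LA_BLAS build, to the single-precision BLAS symbols sgemv_, ssymv_,
+// strmv_ and strsv_.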
+
+#include "x_blas2_lib.c"
+
diff --git a/blas/s_blas2_lib4.c b/blas/s_blas2_lib4.c
new file mode 100644
index 0000000..b7a947d
--- /dev/null
+++ b/blas/s_blas2_lib4.c
@@ -0,0 +1,1045 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
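+// Layout note: sA->pA is stored in panel-major format with 4-row panels of
+// (panel) leading dimension sda = sA->cn, so element (ai+i, aj+j) of the
+// operand sits at
+//     sA->pA[(ai+i)/4*4*sda + (aj+j)*4 + (ai+i)%4];
+// the routines below peel an unaligned head with the _gen_ kernels when
+// ai%4 != 0 and run the aligned kernels over full panels in the main loops.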
+
+void sgemv_n_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+	{
+
+	if(m<0)
+		return;
+
+	const int bs = 4;
+
+	int i;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda;
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	float *z = sz->pa + zi;
+
+	i = 0;
+	// clean up at the beginning
+	if(ai%bs!=0)
+		{
+		kernel_sgemv_n_4_gen_lib4(n, &alpha, pA, x, &beta, y-ai%bs, z-ai%bs, ai%bs, m+ai%bs);
+		pA += bs*sda;
+		y += 4 - ai%bs;
+		z += 4 - ai%bs;
+		m -= 4 - ai%bs;
+		}
+	// main loop
+	for( ; i<m-3; i+=4)
+		{
+		kernel_sgemv_n_4_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
+		}
+	if(i<m)
+		{
+		kernel_sgemv_n_4_vs_lib4(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i], m-i);
+		}
+		
+	return;
+
+	}
+
+
+
+void sgemv_t_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+	{
+
+	if(n<=0)
+		return;
+	
+	const int bs = 4;
+
+	int i;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	float *z = sz->pa + zi;
+
+	if(ai%bs==0)
+		{
+		i = 0;
+		for( ; i<n-3; i+=4)
+			{
+			kernel_sgemv_t_4_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+			}
+		if(i<n)
+			{
+			kernel_sgemv_t_4_vs_lib4(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+			}
+		}
+	else // TODO kernel 8
+		{
+		i = 0;
+		for( ; i<n; i+=4)
+			{
+			kernel_sgemv_t_4_gen_lib4(m, &alpha, ai%bs, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+			}
+		}
+	
+	return;
+
+	}
+
+
+
+void sgemv_nt_libstr(int m, int n, float alpha_n, float alpha_t, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx_n, int xi_n, struct s_strvec *sx_t, int xi_t, float beta_n, float beta_t, struct s_strvec *sy_n, int yi_n, struct s_strvec *sy_t, int yi_t, struct s_strvec *sz_n, int zi_n, struct s_strvec *sz_t, int zi_t)
+	{
+
+	if(ai!=0)
+		{
+		printf("\nsgemv_nt_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs; // TODO ai
+	float *x_n = sx_n->pa + xi_n;
+	float *x_t = sx_t->pa + xi_t;
+	float *y_n = sy_n->pa + yi_n;
+	float *y_t = sy_t->pa + yi_t;
+	float *z_n = sz_n->pa + zi_n;
+	float *z_t = sz_t->pa + zi_t;
+
+//	sgemv_nt_lib(m, n, alpha_n, alpha_t, pA, sda, x_n, x_t, beta_n, beta_t, y_n, y_t, z_n, z_t);
+
+//	if(m<=0 | n<=0)
+//		return;
+
+	int ii;
+
+	// copy and scale y_n into z_n
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		z_n[ii+0] = beta_n*y_n[ii+0];
+		z_n[ii+1] = beta_n*y_n[ii+1];
+		z_n[ii+2] = beta_n*y_n[ii+2];
+		z_n[ii+3] = beta_n*y_n[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		z_n[ii+0] = beta_n*y_n[ii+0];
+		}
+	
+	ii = 0;
+	for(; ii<n-3; ii+=4)
+		{
+		kernel_sgemv_nt_4_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
+		}
+	if(ii<n)
+		{
+		kernel_sgemv_nt_4_vs_lib4(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii, n-ii);
+		}
+	
+	return;
+	}
+
+
+
+void ssymv_l_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+	{
+
+	if(m<=0 | n<=0)
+		return;
+	
+	const int bs = 4;
+
+	int ii, n1;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	float *z = sz->pa + zi;
+
+	// copy and scale y into z
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		z[ii+0] = beta*y[ii+0];
+		z[ii+1] = beta*y[ii+1];
+		z[ii+2] = beta*y[ii+2];
+		z[ii+3] = beta*y[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		z[ii+0] = beta*y[ii+0];
+		}
+	
+	// clean up at the beginning
+	if(ai%bs!=0) // 1, 2, 3
+		{
+		n1 = 4-ai%bs;
+		kernel_ssymv_l_4_gen_lib4(m, &alpha, ai%bs, &pA[0], sda, &x[0], &z[0], n<n1 ? n : n1);
+		pA += n1 + n1*bs + (sda-1)*bs;
+		x += n1;
+		z += n1;
+		m -= n1;
+		n -= n1;
+		}
+	// main loop
+	ii = 0;
+	for(; ii<n-3; ii+=4)
+		{
+		kernel_ssymv_l_4_lib4(m-ii, &alpha, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii]);
+		}
+	// clean up at the end
+	if(ii<n)
+		{
+		kernel_ssymv_l_4_gen_lib4(m-ii, &alpha, 0, &pA[ii*bs+ii*sda], sda, &x[ii], &z[ii], n-ii);
+		}
+	
+	return;
+	}
+
+
+
+// m >= n
+void strmv_lnn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+
+	if(m<=0)
+		return;
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+
+	if(m-n>0)
+		sgemv_n_libstr(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
+
+	float *pA2 = pA;
+	float *z2 = z;
+	int m2 = n;
+	int n2 = 0;
+	float *pA3, *x3;
+
+	float alpha = 1.0;
+	float beta = 1.0;
+
+	float zt[4];
+
+	int ii, jj, jj_end;
+
+	ii = 0;
+
+	if(ai%4!=0)
+		{
+		pA2 += sda*bs - ai%bs;
+		z2 += bs-ai%bs;
+		m2 -= bs-ai%bs;
+		n2 += bs-ai%bs;
+		}
+	
+	pA2 += m2/bs*bs*sda;
+	z2 += m2/bs*bs;
+	n2 += m2/bs*bs;
+
+	if(m2%bs!=0)
+		{
+		//
+		pA3 = pA2 + bs*n2;
+		x3 = x + n2;
+		zt[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+		zt[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+		zt[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+		zt[0] = pA3[0+bs*0]*x3[0];
+		kernel_sgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, zt, zt);
+		for(jj=0; jj<m2%bs; jj++)
+			z2[jj] = zt[jj];
+		}
+	for(; ii<m2-3; ii+=4)
+		{
+		pA2 -= bs*sda;
+		z2 -= 4;
+		n2 -= 4;
+		pA3 = pA2 + bs*n2;
+		x3 = x + n2;
+		z2[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+		z2[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+		z2[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+		z2[0] = pA3[0+bs*0]*x3[0];
+		kernel_sgemv_n_4_lib4(n2, &alpha, pA2, x, &beta, z2, z2);
+		}
+	if(ai%4!=0)
+		{
+		if(ai%bs==1)
+			{
+			zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+			zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+			zt[0] = pA[0+bs*0]*x[0];
+			jj_end = 4-ai%bs<n ? 4-ai%bs : n;
+			for(jj=0; jj<jj_end; jj++)
+				z[jj] = zt[jj];
+			}
+		else if(ai%bs==2)
+			{
+			zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+			zt[0] = pA[0+bs*0]*x[0];
+			jj_end = 4-ai%bs<n ? 4-ai%bs : n;
+			for(jj=0; jj<jj_end; jj++)
+				z[jj] = zt[jj];
+			}
+		else // if (ai%bs==3)
+			{
+			z[0] = pA[0+bs*0]*x[0];
+			}
+		}
+
+	return;
+
+	}
+
+
+
+// m >= n
+void strmv_ltn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+
+	if(m<=0)
+		return;
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+
+	float xt[4];
+	float zt[4];
+
+	float alpha = 1.0;
+	float beta = 1.0;
+
+	int ii, jj, ll, ll_max;
+
+	jj = 0;
+
+	if(ai%bs!=0)
+		{
+
+		if(ai%bs==1)
+			{
+			ll_max = m-jj<3 ? m-jj : 3;
+			for(ll=0; ll<ll_max; ll++)
+				xt[ll] = x[ll];
+			for(; ll<3; ll++)
+				xt[ll] = 0.0;
+			zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2];
+			zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2];
+			zt[2] = pA[2+bs*2]*xt[2];
+			pA += bs*sda - 1;
+			x += 3;
+			kernel_sgemv_t_4_lib4(m-3-jj, &alpha, pA, sda, x, &beta, zt, zt);
+			ll_max = n-jj<3 ? n-jj : 3;
+			for(ll=0; ll<ll_max; ll++)
+				z[ll] = zt[ll];
+			pA += bs*3;
+			z += 3;
+			jj += 3;
+			}
+		else if(ai%bs==2)
+			{
+			ll_max = m-jj<2 ? m-jj : 2;
+			for(ll=0; ll<ll_max; ll++)
+				xt[ll] = x[ll];
+			for(; ll<2; ll++)
+				xt[ll] = 0.0;
+			zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1];
+			zt[1] = pA[1+bs*1]*xt[1];
+			pA += bs*sda - 2;
+			x += 2;
+			kernel_sgemv_t_4_lib4(m-2-jj, &alpha, pA, sda, x, &beta, zt, zt);
+			ll_max = n-jj<2 ? n-jj : 2;
+			for(ll=0; ll<ll_max; ll++)
+				z[ll] = zt[ll];
+			pA += bs*2;
+			z += 2;
+			jj += 2;
+			}
+		else // if(ai%bs==3)
+			{
+			ll_max = m-jj<1 ? m-jj : 1;
+			for(ll=0; ll<ll_max; ll++)
+				xt[ll] = x[ll];
+			for(; ll<1; ll++)
+				xt[ll] = 0.0;
+			zt[0] = pA[0+bs*0]*xt[0];
+			pA += bs*sda - 3;
+			x += 1;
+			kernel_sgemv_t_4_lib4(m-1-jj, &alpha, pA, sda, x, &beta, zt, zt);
+			ll_max = n-jj<1 ? n-jj : 1;
+			for(ll=0; ll<ll_max; ll++)
+				z[ll] = zt[ll];
+			pA += bs*1;
+			z += 1;
+			jj += 1;
+			}
+
+		}
+	
+	for(; jj<n-3; jj+=4)
+		{
+		zt[0] = pA[0+bs*0]*x[0] + pA[1+bs*0]*x[1] + pA[2+bs*0]*x[2] + pA[3+bs*0]*x[3];
+		zt[1] = pA[1+bs*1]*x[1] + pA[2+bs*1]*x[2] + pA[3+bs*1]*x[3];
+		zt[2] = pA[2+bs*2]*x[2] + pA[3+bs*2]*x[3];
+		zt[3] = pA[3+bs*3]*x[3];
+		pA += bs*sda;
+		x += 4;
+		kernel_sgemv_t_4_lib4(m-4-jj, &alpha, pA, sda, x, &beta, zt, z);
+		pA += bs*4;
+		z += 4;
+		}
+	if(jj<n)
+		{
+		ll_max = m-jj<4 ? m-jj : 4;
+		for(ll=0; ll<ll_max; ll++)
+			xt[ll] = x[ll];
+		for(; ll<4; ll++)
+			xt[ll] = 0.0;
+		zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
+		zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
+		zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
+		zt[3] = pA[3+bs*3]*xt[3];
+		pA += bs*sda;
+		x += 4;
+		kernel_sgemv_t_4_lib4(m-4-jj, &alpha, pA, sda, x, &beta, zt, zt);
+		for(ll=0; ll<n-jj; ll++)
+			z[ll] = zt[ll];
+//		pA += bs*4;
+//		z += 4;
+		}
+
+	return;
+
+	}
+
+
+
+void strmv_unn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+
+	if(m<=0)
+		return;
+
+	if(ai!=0)
+		{
+		printf("\nstrmv_unn_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs; // TODO ai
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+
+	int i;
+	
+	i=0;
+	for(; i<m-3; i+=4)
+		{
+		kernel_strmv_un_4_lib4(m-i, pA, x, z);
+		pA += 4*sda+4*bs;
+		x  += 4;
+		z  += 4;
+		}
+	if(m>i)
+		{
+		if(m-i==1)
+			{
+			z[0] = pA[0+bs*0]*x[0];
+			}
+		else if(m-i==2)
+			{
+			z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1];
+			z[1] = pA[1+bs*1]*x[1];
+			}
+		else // if(m-i==3)
+			{
+			z[0] = pA[0+bs*0]*x[0] + pA[0+bs*1]*x[1] + pA[0+bs*2]*x[2];
+			z[1] = pA[1+bs*1]*x[1] + pA[1+bs*2]*x[2];
+			z[2] = pA[2+bs*2]*x[2];
+			}
+		}
+
+	return;
+
+	}
+
+
+
+void strmv_utn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+
+	if(m<=0)
+		return;
+
+	if(ai!=0)
+		{
+		printf("\nstrmv_utn_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs; // TODO ai
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+
+	int ii, idx;
+	
+	float *ptrA;
+	
+	ii=0;
+	idx = m/bs*bs;
+	if(m%bs!=0)
+		{
+		kernel_strmv_ut_4_vs_lib4(m, pA+idx*bs, sda, x, z+idx, m%bs);
+		ii += m%bs;
+		}
+	idx -= 4;
+	for(; ii<m; ii+=4)
+		{
+		kernel_strmv_ut_4_lib4(idx+4, pA+idx*bs, sda, x, z+idx);
+		idx -= 4;
+		}
+
+	return;
+
+	}
+
+
+
+void strsv_lnn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+
+	if(m==0)
+		return;
+
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** strsv_lnn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** strsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** strsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** strsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** strsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** strsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** strsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** strsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** strsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+	if(ai!=0)
+		{
+		printf("\nstrsv_lnn_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs; // TODO ai
+	float *dA = sA->dA;
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+
+	int ii;
+
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+			for(ii=0; ii<m; ii++)
+				dA[ii] = 1.0 / dA[ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+		for(ii=0; ii<m; ii++)
+			dA[ii] = 1.0 / dA[ii];
+		sA->use_dA = 0;
+		}
+
+	int i;
+
+	if(x!=z)
+		{
+		for(i=0; i<m; i++)
+			z[i] = x[i];
+		}
+	
+	i = 0;
+	for( ; i<m-3; i+=4)
+		{
+		kernel_strsv_ln_inv_4_lib4(i, &pA[i*sda], &dA[i], z, &z[i], &z[i]);
+		}
+	if(i<m)
+		{
+		kernel_strsv_ln_inv_4_vs_lib4(i, &pA[i*sda], &dA[i], z, &z[i], &z[i], m-i, m-i);
+		i+=4;
+		}
+
+	return;
+
+	}
+
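+// Note on the inverted diagonal: the implemented strsv_ solves in this file
+// (lnn, lnn_mn, ltn, ltn_mn) cache 1.0/diag(A) in sA->dA and set sA->use_dA
+// when the operand starts at the top-left corner (ai==0 and aj==0), so
+// repeated solves with the same factor reuse the reciprocal diagonal; for any
+// other offset it is recomputed on every call.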
+
+
+void strsv_lnn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+
+	if(m==0 | n==0)
+		return;
+
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** strsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** strsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** strsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** strsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** strsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** strsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** strsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** strsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** strsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** strsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+	if(ai!=0)
+		{
+		printf("\nstrsv_lnn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs; // TODO ai
+	float *dA = sA->dA;
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+
+	int ii;
+
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+			for(ii=0; ii<n; ii++)
+				dA[ii] = 1.0 / dA[ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+		for(ii=0; ii<n; ii++)
+			dA[ii] = 1.0 / dA[ii];
+		sA->use_dA = 0;
+		}
+
+	if(m<n)
+		m = n;
+
+	float alpha = -1.0;
+	float beta = 1.0;
+
+	int i;
+
+	if(x!=z)
+		{
+		for(i=0; i<m; i++)
+			z[i] = x[i];
+		}
+	
+	i = 0;
+	for( ; i<n-3; i+=4)
+		{
+		kernel_strsv_ln_inv_4_lib4(i, &pA[i*sda], &dA[i], z, &z[i], &z[i]);
+		}
+	if(i<n)
+		{
+		kernel_strsv_ln_inv_4_vs_lib4(i, &pA[i*sda], &dA[i], z, &z[i], &z[i], m-i, n-i);
+		i+=4;
+		}
+	for( ; i<m-3; i+=4)
+		{
+		kernel_sgemv_n_4_lib4(n, &alpha, &pA[i*sda], z, &beta, &z[i], &z[i]);
+		}
+	if(i<m)
+		{
+		kernel_sgemv_n_4_vs_lib4(n, &alpha, &pA[i*sda], z, &beta, &z[i], &z[i], m-i);
+		i+=4;
+		}
+
+	return;
+
+	}
+
+
+
+void strsv_ltn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+
+	if(m==0)
+		return;
+
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** strsv_ltn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** strsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** strsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** strsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** strsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** strsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** strsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** strsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** strsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+	if(ai!=0)
+		{
+		printf("\nstrsv_ltn_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs; // TODO ai
+	float *dA = sA->dA;
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+
+	int ii;
+
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+			for(ii=0; ii<m; ii++)
+				dA[ii] = 1.0 / dA[ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+		for(ii=0; ii<m; ii++)
+			dA[ii] = 1.0 / dA[ii];
+		sA->use_dA = 0;
+		}
+
+	int i;
+	
+	if(x!=z)
+		for(i=0; i<m; i++)
+			z[i] = x[i];
+			
+	i=0;
+	if(m%4==1)
+		{
+		kernel_strsv_lt_inv_1_lib4(i+1, &pA[m/bs*bs*sda+(m-i-1)*bs], sda, &dA[m-i-1], &z[m-i-1], &z[m-i-1], &z[m-i-1]);
+		i++;
+		}
+	else if(m%4==2)
+		{
+		kernel_strsv_lt_inv_2_lib4(i+2, &pA[m/bs*bs*sda+(m-i-2)*bs], sda, &dA[m-i-2], &z[m-i-2], &z[m-i-2], &z[m-i-2]);
+		i+=2;
+		}
+	else if(m%4==3)
+		{
+		kernel_strsv_lt_inv_3_lib4(i+3, &pA[m/bs*bs*sda+(m-i-3)*bs], sda, &dA[m-i-3], &z[m-i-3], &z[m-i-3], &z[m-i-3]);
+		i+=3;
+		}
+	for(; i<m-3; i+=4)
+		{
+		kernel_strsv_lt_inv_4_lib4(i+4, &pA[(m-i-4)/bs*bs*sda+(m-i-4)*bs], sda, &dA[m-i-4], &z[m-i-4], &z[m-i-4], &z[m-i-4]);
+		}
+
+	return;
+
+	}
+
+
+
+void strsv_ltn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+
+	if(m==0)
+		return;
+
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** strsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** strsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** strsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** strsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** strsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** strsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** strsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** strsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** strsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** strsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+	if(ai!=0)
+		{
+		printf("\nstrsv_ltn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs; // TODO ai
+	float *dA = sA->dA;
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+
+	int ii;
+
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+			for(ii=0; ii<n; ii++)
+				dA[ii] = 1.0 / dA[ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+		for(ii=0; ii<n; ii++)
+			dA[ii] = 1.0 / dA[ii];
+		sA->use_dA = 0;
+		}
+
+	if(n>m)
+		n = m;
+	
+	int i;
+	
+	if(x!=z)
+		for(i=0; i<m; i++)
+			z[i] = x[i];
+			
+	i=0;
+	if(n%4==1)
+		{
+		kernel_strsv_lt_inv_1_lib4(m-n+i+1, &pA[n/bs*bs*sda+(n-i-1)*bs], sda, &dA[n-i-1], &z[n-i-1], &z[n-i-1], &z[n-i-1]);
+		i++;
+		}
+	else if(n%4==2)
+		{
+		kernel_strsv_lt_inv_2_lib4(m-n+i+2, &pA[n/bs*bs*sda+(n-i-2)*bs], sda, &dA[n-i-2], &z[n-i-2], &z[n-i-2], &z[n-i-2]);
+		i+=2;
+		}
+	else if(n%4==3)
+		{
+		kernel_strsv_lt_inv_3_lib4(m-n+i+3, &pA[n/bs*bs*sda+(n-i-3)*bs], sda, &dA[n-i-3], &z[n-i-3], &z[n-i-3], &z[n-i-3]);
+		i+=3;
+		}
+	for(; i<n-3; i+=4)
+		{
+		kernel_strsv_lt_inv_4_lib4(m-n+i+4, &pA[(n-i-4)/bs*bs*sda+(n-i-4)*bs], sda, &dA[n-i-4], &z[n-i-4], &z[n-i-4], &z[n-i-4]);
+		}
+
+	return;
+
+	}
+
+
+
+void strsv_lnu_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** strsv_lnu_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** strsv_lnu_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** strsv_lnu_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** strsv_lnu_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** strsv_lnu_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** strsv_lnu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** strsv_lnu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** strsv_lnu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** strsv_lnu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	printf("\n***** strsv_lnu_libstr : feature not implemented yet *****\n");
+	exit(1);
+	}
+
+
+
+void strsv_ltu_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** strsv_ltu_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** strsv_ltu_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** strsv_ltu_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** strsv_ltu_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** strsv_ltu_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** strsv_ltu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** strsv_ltu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** strsv_ltu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** strsv_ltu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	printf("\n***** strsv_ltu_libstr : feature not implemented yet *****\n");
+	exit(1);
+	}
+
+
+
+void strsv_unn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** strsv_unn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** strsv_unn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** strsv_unn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** strsv_unn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** strsv_unn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** strsv_unn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** strsv_unn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** strsv_unn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** strsv_unn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	printf("\n***** strsv_unn_libstr : feature not implemented yet *****\n");
+	exit(1);
+	}
+
+
+
+void strsv_utn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** strsv_utn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** strsv_utn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** strsv_utn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** strsv_utn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** strsv_utn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** strsv_utn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** strsv_utn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** strsv_utn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** strsv_utn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	printf("\n***** strsv_utn_libstr : feature not implemented yet *****\n");
+	exit(1);
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/blas/s_blas2_lib8.c b/blas/s_blas2_lib8.c
new file mode 100644
index 0000000..41a78c4
--- /dev/null
+++ b/blas/s_blas2_lib8.c
@@ -0,0 +1,1008 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
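+// This file mirrors s_blas2_lib4.c but uses the 8-wide panel format (bs = 8):
+// element (ai+i, aj+j) is addressed as
+//     sA->pA[(ai+i)/8*8*sda + (aj+j)*8 + (ai+i)%8],
+// and the unaligned-head code distinguishes all offsets ai%8 = 1..7.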
+
+void sgemv_n_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+	{
+
+	if(m<0)
+		return;
+
+	const int bs = 8;
+
+	int i;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda;
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	float *z = sz->pa + zi;
+
+	i = 0;
+	// clean up at the beginning
+	if(ai%bs!=0)
+		{
+		kernel_sgemv_n_8_gen_lib8(n, &alpha, pA, x, &beta, y-ai%bs, z-ai%bs, ai%bs, m+ai%bs);
+		pA += bs*sda;
+		y += 8 - ai%bs;
+		z += 8 - ai%bs;
+		m -= 8 - ai%bs;
+		}
+	// main loop
+	for( ; i<m-7; i+=8)
+		{
+		kernel_sgemv_n_8_lib8(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i]);
+		}
+	if(i<m)
+		{
+		kernel_sgemv_n_8_vs_lib8(n, &alpha, &pA[i*sda], x, &beta, &y[i], &z[i], m-i);
+		}
+		
+	return;
+
+	}
+
+
+
+void sgemv_t_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+	{
+
+	if(n<=0)
+		return;
+	
+	const int bs = 8;
+
+	int i;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	float *z = sz->pa + zi;
+
+	if(ai%bs==0)
+		{
+		i = 0;
+		for( ; i<n-7; i+=8)
+			{
+			kernel_sgemv_t_8_lib8(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i]);
+			}
+		if(i<n)
+			{
+			if(n-i<=4)
+				{
+				kernel_sgemv_t_4_vs_lib8(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+				}
+			else
+				{
+				kernel_sgemv_t_8_vs_lib8(m, &alpha, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+				}
+			}
+		}
+	else
+		{
+		i = 0;
+		for( ; i<n-4; i+=8)
+			{
+			kernel_sgemv_t_8_gen_lib8(m, &alpha, ai%bs, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+			}
+		if(i<n)
+			{
+			kernel_sgemv_t_4_gen_lib8(m, &alpha, ai%bs, &pA[i*bs], sda, x, &beta, &y[i], &z[i], n-i);
+			}
+		}
+	
+	return;
+
+	}
+
+
+
+// m >= n
+void strmv_lnn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+
+	if(m<=0)
+		return;
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+
+	if(m-n>0)
+		sgemv_n_libstr(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
+
+	float *pA2 = pA;
+	float *z2 = z;
+	int m2 = n;
+	int n2 = 0;
+	float *pA3, *x3;
+
+	float alpha = 1.0;
+	float beta = 1.0;
+
+	float zt[8];
+
+	int ii, jj, jj_end;
+
+	ii = 0;
+
+	if(ai%bs!=0)
+		{
+		pA2 += sda*bs - ai%bs;
+		z2 += bs-ai%bs;
+		m2 -= bs-ai%bs;
+		n2 += bs-ai%bs;
+		}
+	
+	pA2 += m2/bs*bs*sda;
+	z2 += m2/bs*bs;
+	n2 += m2/bs*bs;
+
+	if(m2%bs!=0)
+		{
+		//
+		pA3 = pA2 + bs*n2;
+		x3 = x + n2;
+		zt[7] = pA3[7+bs*0]*x3[0] + pA3[7+bs*1]*x3[1] + pA3[7+bs*2]*x3[2] + pA3[7+bs*3]*x3[3] + pA3[7+bs*4]*x3[4] + pA3[7+bs*5]*x3[5] + pA3[7+bs*6]*x3[6] + pA3[7+bs*7]*x3[7];
+		zt[6] = pA3[6+bs*0]*x3[0] + pA3[6+bs*1]*x3[1] + pA3[6+bs*2]*x3[2] + pA3[6+bs*3]*x3[3] + pA3[6+bs*4]*x3[4] + pA3[6+bs*5]*x3[5] + pA3[6+bs*6]*x3[6];
+		zt[5] = pA3[5+bs*0]*x3[0] + pA3[5+bs*1]*x3[1] + pA3[5+bs*2]*x3[2] + pA3[5+bs*3]*x3[3] + pA3[5+bs*4]*x3[4] + pA3[5+bs*5]*x3[5];
+		zt[4] = pA3[4+bs*0]*x3[0] + pA3[4+bs*1]*x3[1] + pA3[4+bs*2]*x3[2] + pA3[4+bs*3]*x3[3] + pA3[4+bs*4]*x3[4];
+		zt[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+		zt[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+		zt[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+		zt[0] = pA3[0+bs*0]*x3[0];
+		kernel_sgemv_n_8_lib8(n2, &alpha, pA2, x, &beta, zt, zt);
+		for(jj=0; jj<m2%bs; jj++)
+			z2[jj] = zt[jj];
+		}
+	for(; ii<m2-7; ii+=8)
+		{
+		pA2 -= bs*sda;
+		z2 -= 8;
+		n2 -= 8;
+		pA3 = pA2 + bs*n2;
+		x3 = x + n2;
+		z2[7] = pA3[7+bs*0]*x3[0] + pA3[7+bs*1]*x3[1] + pA3[7+bs*2]*x3[2] + pA3[7+bs*3]*x3[3] + pA3[7+bs*4]*x3[4] + pA3[7+bs*5]*x3[5] + pA3[7+bs*6]*x3[6] + pA3[7+bs*7]*x3[7];
+		z2[6] = pA3[6+bs*0]*x3[0] + pA3[6+bs*1]*x3[1] + pA3[6+bs*2]*x3[2] + pA3[6+bs*3]*x3[3] + pA3[6+bs*4]*x3[4] + pA3[6+bs*5]*x3[5] + pA3[6+bs*6]*x3[6];
+		z2[5] = pA3[5+bs*0]*x3[0] + pA3[5+bs*1]*x3[1] + pA3[5+bs*2]*x3[2] + pA3[5+bs*3]*x3[3] + pA3[5+bs*4]*x3[4] + pA3[5+bs*5]*x3[5];
+		z2[4] = pA3[4+bs*0]*x3[0] + pA3[4+bs*1]*x3[1] + pA3[4+bs*2]*x3[2] + pA3[4+bs*3]*x3[3] + pA3[4+bs*4]*x3[4];
+		z2[3] = pA3[3+bs*0]*x3[0] + pA3[3+bs*1]*x3[1] + pA3[3+bs*2]*x3[2] + pA3[3+bs*3]*x3[3];
+		z2[2] = pA3[2+bs*0]*x3[0] + pA3[2+bs*1]*x3[1] + pA3[2+bs*2]*x3[2];
+		z2[1] = pA3[1+bs*0]*x3[0] + pA3[1+bs*1]*x3[1];
+		z2[0] = pA3[0+bs*0]*x3[0];
+		kernel_sgemv_n_8_lib8(n2, &alpha, pA2, x, &beta, z2, z2);
+		}
+	if(ai%bs!=0)
+		{
+		if(ai%bs==1)
+			{
+			zt[6] = pA[6+bs*0]*x[0] + pA[6+bs*1]*x[1] + pA[6+bs*2]*x[2] + pA[6+bs*3]*x[3] + pA[6+bs*4]*x[4] + pA[6+bs*5]*x[5] + pA[6+bs*6]*x[6];
+			zt[5] = pA[5+bs*0]*x[0] + pA[5+bs*1]*x[1] + pA[5+bs*2]*x[2] + pA[5+bs*3]*x[3] + pA[5+bs*4]*x[4] + pA[5+bs*5]*x[5];
+			zt[4] = pA[4+bs*0]*x[0] + pA[4+bs*1]*x[1] + pA[4+bs*2]*x[2] + pA[4+bs*3]*x[3] + pA[4+bs*4]*x[4];
+			zt[3] = pA[3+bs*0]*x[0] + pA[3+bs*1]*x[1] + pA[3+bs*2]*x[2] + pA[3+bs*3]*x[3];
+			zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+			zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+			zt[0] = pA[0+bs*0]*x[0];
+			jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+			for(jj=0; jj<jj_end; jj++)
+				z[jj] = zt[jj];
+			}
+		else if(ai%bs==2)
+			{
+			zt[5] = pA[5+bs*0]*x[0] + pA[5+bs*1]*x[1] + pA[5+bs*2]*x[2] + pA[5+bs*3]*x[3] + pA[5+bs*4]*x[4] + pA[5+bs*5]*x[5];
+			zt[4] = pA[4+bs*0]*x[0] + pA[4+bs*1]*x[1] + pA[4+bs*2]*x[2] + pA[4+bs*3]*x[3] + pA[4+bs*4]*x[4];
+			zt[3] = pA[3+bs*0]*x[0] + pA[3+bs*1]*x[1] + pA[3+bs*2]*x[2] + pA[3+bs*3]*x[3];
+			zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+			zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+			zt[0] = pA[0+bs*0]*x[0];
+			jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+			for(jj=0; jj<jj_end; jj++)
+				z[jj] = zt[jj];
+			}
+		else if(ai%bs==3)
+			{
+			zt[4] = pA[4+bs*0]*x[0] + pA[4+bs*1]*x[1] + pA[4+bs*2]*x[2] + pA[4+bs*3]*x[3] + pA[4+bs*4]*x[4];
+			zt[3] = pA[3+bs*0]*x[0] + pA[3+bs*1]*x[1] + pA[3+bs*2]*x[2] + pA[3+bs*3]*x[3];
+			zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+			zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+			zt[0] = pA[0+bs*0]*x[0];
+			jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+			for(jj=0; jj<jj_end; jj++)
+				z[jj] = zt[jj];
+			}
+		else if(ai%bs==4)
+			{
+			zt[3] = pA[3+bs*0]*x[0] + pA[3+bs*1]*x[1] + pA[3+bs*2]*x[2] + pA[3+bs*3]*x[3];
+			zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+			zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+			zt[0] = pA[0+bs*0]*x[0];
+			jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+			for(jj=0; jj<jj_end; jj++)
+				z[jj] = zt[jj];
+			}
+		else if(ai%bs==5)
+			{
+			zt[2] = pA[2+bs*0]*x[0] + pA[2+bs*1]*x[1] + pA[2+bs*2]*x[2];
+			zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+			zt[0] = pA[0+bs*0]*x[0];
+			jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+			for(jj=0; jj<jj_end; jj++)
+				z[jj] = zt[jj];
+			}
+		else if(ai%bs==6)
+			{
+			zt[1] = pA[1+bs*0]*x[0] + pA[1+bs*1]*x[1];
+			zt[0] = pA[0+bs*0]*x[0];
+			jj_end = 8-ai%bs<n ? 8-ai%bs : n;
+			for(jj=0; jj<jj_end; jj++)
+				z[jj] = zt[jj];
+			}
+		else // if (ai%bs==7)
+			{
+			z[0] = pA[0+bs*0]*x[0];
+			}
+		}
+
+	return;
+
+	}
+
+
+
+// m >= n
+void strmv_ltn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+
+	if(m<=0)
+		return;
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+
+	float xt[8];
+	float zt[8];
+
+	float alpha = 1.0;
+	float beta = 1.0;
+
+	int ii, jj, ll, ll_max;
+
+	jj = 0;
+
+	if(ai%bs!=0)
+		{
+
+		if(ai%bs==1)
+			{
+			ll_max = m-jj<7 ? m-jj : 7;
+			for(ll=0; ll<ll_max; ll++)
+				xt[ll] = x[ll];
+			for(; ll<7; ll++)
+				xt[ll] = 0.0;
+			zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3] + pA[4+bs*0]*xt[4] + pA[5+bs*0]*xt[5] + pA[6+bs*0]*xt[6];
+			zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3] + pA[4+bs*1]*xt[4] + pA[5+bs*1]*xt[5] + pA[6+bs*1]*xt[6];
+			zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3] + pA[4+bs*2]*xt[4] + pA[5+bs*2]*xt[5] + pA[6+bs*2]*xt[6];
+			zt[3] = pA[3+bs*3]*xt[3] + pA[4+bs*3]*xt[4] + pA[5+bs*3]*xt[5] + pA[6+bs*3]*xt[6];
+			zt[4] = pA[4+bs*4]*xt[4] + pA[5+bs*4]*xt[5] + pA[6+bs*4]*xt[6];
+			zt[5] = pA[5+bs*5]*xt[5] + pA[6+bs*5]*xt[6];
+			zt[6] = pA[6+bs*6]*xt[6];
+			pA += bs*sda - 1;
+			x += 7;
+			kernel_sgemv_t_8_lib8(m-7-jj, &alpha, pA, sda, x, &beta, zt, zt);
+			ll_max = n-jj<7 ? n-jj : 7;
+			for(ll=0; ll<ll_max; ll++)
+				z[ll] = zt[ll];
+			pA += bs*7;
+			z += 7;
+			jj += 7;
+			}
+		else if(ai%bs==2)
+			{
+			ll_max = m-jj<6 ? m-jj : 6;
+			for(ll=0; ll<ll_max; ll++)
+				xt[ll] = x[ll];
+			for(; ll<6; ll++)
+				xt[ll] = 0.0;
+			zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3] + pA[4+bs*0]*xt[4] + pA[5+bs*0]*xt[5];
+			zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3] + pA[4+bs*1]*xt[4] + pA[5+bs*1]*xt[5];
+			zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3] + pA[4+bs*2]*xt[4] + pA[5+bs*2]*xt[5];
+			zt[3] = pA[3+bs*3]*xt[3] + pA[4+bs*3]*xt[4] + pA[5+bs*3]*xt[5];
+			zt[4] = pA[4+bs*4]*xt[4] + pA[5+bs*4]*xt[5];
+			zt[5] = pA[5+bs*5]*xt[5];
+			pA += bs*sda - 2;
+			x += 6;
+			kernel_sgemv_t_8_lib8(m-6-jj, &alpha, pA, sda, x, &beta, zt, zt);
+			ll_max = n-jj<6 ? n-jj : 6;
+			for(ll=0; ll<ll_max; ll++)
+				z[ll] = zt[ll];
+			pA += bs*6;
+			z += 6;
+			jj += 6;
+			}
+		else if(ai%bs==3)
+			{
+			ll_max = m-jj<5 ? m-jj : 5;
+			for(ll=0; ll<ll_max; ll++)
+				xt[ll] = x[ll];
+			for(; ll<5; ll++)
+				xt[ll] = 0.0;
+			zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3] + pA[4+bs*0]*xt[4];
+			zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3] + pA[4+bs*1]*xt[4];
+			zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3] + pA[4+bs*2]*xt[4];
+			zt[3] = pA[3+bs*3]*xt[3] + pA[4+bs*3]*xt[4];
+			zt[4] = pA[4+bs*4]*xt[4];
+			pA += bs*sda - 3;
+			x += 5;
+			kernel_sgemv_t_8_lib8(m-5-jj, &alpha, pA, sda, x, &beta, zt, zt);
+			ll_max = n-jj<5 ? n-jj : 5;
+			for(ll=0; ll<ll_max; ll++)
+				z[ll] = zt[ll];
+			pA += bs*5;
+			z += 5;
+			jj += 5;
+			}
+		else if(ai%bs==4)
+			{
+			ll_max = m-jj<4 ? m-jj : 4;
+			for(ll=0; ll<ll_max; ll++)
+				xt[ll] = x[ll];
+			for(; ll<4; ll++)
+				xt[ll] = 0.0;
+			zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
+			zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
+			zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
+			zt[3] = pA[3+bs*3]*xt[3];
+			pA += bs*sda - 4;
+			x += 4;
+			kernel_sgemv_t_8_lib8(m-4-jj, &alpha, pA, sda, x, &beta, zt, zt);
+			ll_max = n-jj<4 ? n-jj : 4;
+			for(ll=0; ll<ll_max; ll++)
+				z[ll] = zt[ll];
+			pA += bs*4;
+			z += 4;
+			jj += 4;
+			}
+		else if(ai%bs==5)
+			{
+			ll_max = m-jj<3 ? m-jj : 3;
+			for(ll=0; ll<ll_max; ll++)
+				xt[ll] = x[ll];
+			for(; ll<3; ll++)
+				xt[ll] = 0.0;
+			zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2];
+			zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2];
+			zt[2] = pA[2+bs*2]*xt[2];
+			pA += bs*sda - 5;
+			x += 3;
+			kernel_sgemv_t_8_lib8(m-3-jj, &alpha, pA, sda, x, &beta, zt, zt);
+			ll_max = n-jj<3 ? n-jj : 3;
+			for(ll=0; ll<ll_max; ll++)
+				z[ll] = zt[ll];
+			pA += bs*3;
+			z += 3;
+			jj += 3;
+			}
+		else if(ai%bs==6)
+			{
+			ll_max = m-jj<2 ? m-jj : 2;
+			for(ll=0; ll<ll_max; ll++)
+				xt[ll] = x[ll];
+			for(; ll<2; ll++)
+				xt[ll] = 0.0;
+			zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1];
+			zt[1] = pA[1+bs*1]*xt[1];
+			pA += bs*sda - 6;
+			x += 2;
+			kernel_sgemv_t_8_lib8(m-2-jj, &alpha, pA, sda, x, &beta, zt, zt);
+			ll_max = n-jj<2 ? n-jj : 2;
+			for(ll=0; ll<ll_max; ll++)
+				z[ll] = zt[ll];
+			pA += bs*2;
+			z += 2;
+			jj += 2;
+			}
+		else // if(ai%bs==7)
+			{
+			ll_max = m-jj<1 ? m-jj : 1;
+			for(ll=0; ll<ll_max; ll++)
+				xt[ll] = x[ll];
+			for(; ll<1; ll++)
+				xt[ll] = 0.0;
+			zt[0] = pA[0+bs*0]*xt[0];
+			pA += bs*sda - 7;
+			x += 1;
+			kernel_sgemv_t_8_lib8(m-1-jj, &alpha, pA, sda, x, &beta, zt, zt);
+			ll_max = n-jj<1 ? n-jj : 1;
+			for(ll=0; ll<ll_max; ll++)
+				z[ll] = zt[ll];
+			pA += bs*1;
+			z += 1;
+			jj += 1;
+			}
+
+		}
+	
+	for(; jj<n-7; jj+=8)
+		{
+		zt[0] = pA[0+bs*0]*x[0] + pA[1+bs*0]*x[1] + pA[2+bs*0]*x[2] + pA[3+bs*0]*x[3] + pA[4+bs*0]*x[4] + pA[5+bs*0]*x[5] + pA[6+bs*0]*x[6] + pA[7+bs*0]*x[7];
+		zt[1] = pA[1+bs*1]*x[1] + pA[2+bs*1]*x[2] + pA[3+bs*1]*x[3] + pA[4+bs*1]*x[4] + pA[5+bs*1]*x[5] + pA[6+bs*1]*x[6] + pA[7+bs*1]*x[7];
+		zt[2] = pA[2+bs*2]*x[2] + pA[3+bs*2]*x[3] + pA[4+bs*2]*x[4] + pA[5+bs*2]*x[5] + pA[6+bs*2]*x[6] + pA[7+bs*2]*x[7];
+		zt[3] = pA[3+bs*3]*x[3] + pA[4+bs*3]*x[4] + pA[5+bs*3]*x[5] + pA[6+bs*3]*x[6] + pA[7+bs*3]*x[7];
+		zt[4] = pA[4+bs*4]*x[4] + pA[5+bs*4]*x[5] + pA[6+bs*4]*x[6] + pA[7+bs*4]*x[7];
+		zt[5] = pA[5+bs*5]*x[5] + pA[6+bs*5]*x[6] + pA[7+bs*5]*x[7];
+		zt[6] = pA[6+bs*6]*x[6] + pA[7+bs*6]*x[7];
+		zt[7] = pA[7+bs*7]*x[7];
+		pA += bs*sda;
+		x += 8;
+		kernel_sgemv_t_8_lib8(m-8-jj, &alpha, pA, sda, x, &beta, zt, z);
+		pA += bs*8;
+		z += 8;
+		}
+	if(jj<n)
+		{
+		if(n-jj<=4)
+			{
+			ll_max = m-jj<4 ? m-jj : 4;
+			for(ll=0; ll<ll_max; ll++)
+				xt[ll] = x[ll];
+			for(; ll<4; ll++)
+				xt[ll] = 0.0;
+			zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3];
+			zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3];
+			zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3];
+			zt[3] = pA[3+bs*3]*xt[3];
+			pA += bs*sda;
+			x += 4;
+			kernel_sgemv_t_4_lib8(m-4-jj, &alpha, pA, sda, x, &beta, zt, zt);
+			for(ll=0; ll<n-jj; ll++)
+				z[ll] = zt[ll];
+//			pA += bs*4;
+//			z += 4;
+			}
+		else
+			{
+			ll_max = m-jj<8 ? m-jj : 8;
+			for(ll=0; ll<ll_max; ll++)
+				xt[ll] = x[ll];
+			for(; ll<8; ll++)
+				xt[ll] = 0.0;
+			zt[0] = pA[0+bs*0]*xt[0] + pA[1+bs*0]*xt[1] + pA[2+bs*0]*xt[2] + pA[3+bs*0]*xt[3] + pA[4+bs*0]*xt[4] + pA[5+bs*0]*xt[5] + pA[6+bs*0]*xt[6] + pA[7+bs*0]*xt[7];
+			zt[1] = pA[1+bs*1]*xt[1] + pA[2+bs*1]*xt[2] + pA[3+bs*1]*xt[3] + pA[4+bs*1]*xt[4] + pA[5+bs*1]*xt[5] + pA[6+bs*1]*xt[6] + pA[7+bs*1]*xt[7];
+			zt[2] = pA[2+bs*2]*xt[2] + pA[3+bs*2]*xt[3] + pA[4+bs*2]*xt[4] + pA[5+bs*2]*xt[5] + pA[6+bs*2]*xt[6] + pA[7+bs*2]*xt[7];
+			zt[3] = pA[3+bs*3]*xt[3] + pA[4+bs*3]*xt[4] + pA[5+bs*3]*xt[5] + pA[6+bs*3]*xt[6] + pA[7+bs*3]*xt[7];
+			zt[4] = pA[4+bs*4]*xt[4] + pA[5+bs*4]*xt[5] + pA[6+bs*4]*xt[6] + pA[7+bs*4]*xt[7];
+			zt[5] = pA[5+bs*5]*xt[5] + pA[6+bs*5]*xt[6] + pA[7+bs*5]*xt[7];
+			zt[6] = pA[6+bs*6]*xt[6] + pA[7+bs*6]*xt[7];
+			zt[7] = pA[7+bs*7]*xt[7];
+			pA += bs*sda;
+			x += 8;
+			kernel_sgemv_t_8_lib8(m-8-jj, &alpha, pA, sda, x, &beta, zt, zt);
+			for(ll=0; ll<n-jj; ll++)
+				z[ll] = zt[ll];
+//			pA += bs*8;
+//			z += 8;
+			}
+		}
+
+	return;
+
+	}
+
+
+
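+// z = inv(A) * x, with A (m x m) lower triangular, not transposed, not unit diagonal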
+void strsv_lnn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+
+	if(m==0)
+		return;
+
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** strsv_lnn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** strsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** strsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** strsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** strsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** strsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** strsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** strsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** strsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+	if(ai!=0)
+		{
+		printf("\nstrsv_lnn_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs; // TODO ai
+	float *dA = sA->dA;
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+
+	int ii;
+
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+			for(ii=0; ii<m; ii++)
+				dA[ii] = 1.0 / dA[ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+		for(ii=0; ii<m; ii++)
+			dA[ii] = 1.0 / dA[ii];
+		sA->use_dA = 0;
+		}
+
+	int i;
+
+	if(x!=z)
+		{
+		for(i=0; i<m; i++)
+			z[i] = x[i];
+		}
+	
+	i = 0;
+	for( ; i<m-7; i+=8)
+		{
+		kernel_strsv_ln_inv_8_lib8(i, &pA[i*sda], &dA[i], z, &z[i], &z[i]);
+		}
+	if(i<m)
+		{
+		kernel_strsv_ln_inv_8_vs_lib8(i, &pA[i*sda], &dA[i], z, &z[i], &z[i], m-i, m-i);
+		i+=8;
+		}
+
+	return;
+
+	}
+
+
+
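+// z[0:n] = inv(A[0:n,0:n]) * x[0:n] ; z[n:m] = x[n:m] - A[n:m,0:n] * z[0:n], with A (m x n) lower trapezoidal, not unit diagonal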
+void strsv_lnn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+
+	if(m==0 | n==0)
+		return;
+
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** strsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** strsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** strsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** strsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** strsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** strsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** strsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** strsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** strsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** strsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+	if(ai!=0)
+		{
+		printf("\nstrsv_lnn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs; // TODO ai
+	float *dA = sA->dA;
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+
+	int ii;
+
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+			for(ii=0; ii<n; ii++)
+				dA[ii] = 1.0 / dA[ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+		for(ii=0; ii<n; ii++)
+			dA[ii] = 1.0 / dA[ii];
+		sA->use_dA = 0;
+		}
+
+	if(m<n)
+		m = n;
+
+	float alpha = -1.0;
+	float beta = 1.0;
+
+	int i;
+
+	if(x!=z)
+		{
+		for(i=0; i<m; i++)
+			z[i] = x[i];
+		}
+	
+	i = 0;
+	for( ; i<n-7; i+=8)
+		{
+		kernel_strsv_ln_inv_8_lib8(i, &pA[i*sda], &dA[i], z, &z[i], &z[i]);
+		}
+	if(i<n)
+		{
+		kernel_strsv_ln_inv_8_vs_lib8(i, &pA[i*sda], &dA[i], z, &z[i], &z[i], m-i, n-i);
+		i+=8;
+		}
+	for( ; i<m-7; i+=8)
+		{
+		kernel_sgemv_n_8_lib8(n, &alpha, &pA[i*sda], z, &beta, &z[i], &z[i]);
+		}
+	if(i<m)
+		{
+		kernel_sgemv_n_8_vs_lib8(n, &alpha, &pA[i*sda], z, &beta, &z[i], &z[i], m-i);
+		i+=8;
+		}
+
+	return;
+
+	}
+
+
+
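+// z = inv(A') * x, with A (m x m) lower triangular, transposed, not unit diagonal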
+void strsv_ltn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+
+	if(m==0)
+		return;
+
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** strsv_ltn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** strsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** strsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** strsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** strsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** strsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** strsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** strsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** strsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+	if(ai!=0)
+		{
+		printf("\nstrsv_ltn_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs; // TODO ai
+	float *dA = sA->dA;
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+
+	int ii;
+
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+			for(ii=0; ii<m; ii++)
+				dA[ii] = 1.0 / dA[ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		sdiaex_lib(m, 1.0, ai, pA, sda, dA);
+		for(ii=0; ii<m; ii++)
+			dA[ii] = 1.0 / dA[ii];
+		sA->use_dA = 0;
+		}
+
+	int i, i1;
+	
+	if(x!=z)
+		for(i=0; i<m; i++)
+			z[i] = x[i];
+			
+	i=0;
+	i1 = m%8;
+	if(i1!=0)
+		{
+		kernel_strsv_lt_inv_8_vs_lib8(i+i1, &pA[m/bs*bs*sda+(m-i-i1)*bs], sda, &dA[m-i-i1], &z[m-i-i1], &z[m-i-i1], &z[m-i-i1], i1, i1);
+		i += i1;
+		}
+	for(; i<m-7; i+=8)
+		{
+		kernel_strsv_lt_inv_8_lib8(i+8, &pA[(m-i-8)/bs*bs*sda+(m-i-8)*bs], sda, &dA[m-i-8], &z[m-i-8], &z[m-i-8], &z[m-i-8]);
+		}
+
+	return;
+
+	}
+
+
+
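+// z[0:n] = inv(A[0:n,0:n]') * ( x[0:n] - A[n:m,0:n]' * x[n:m] ), with A (m x n) lower trapezoidal, not unit diagonal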
+void strsv_ltn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi)
+	{
+
+	if(m==0)
+		return;
+
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** strsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** strsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** strsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** strsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** strsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** strsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** strsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** strsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** strsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** strsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+
+	if(ai!=0)
+		{
+		printf("\nstrsv_ltn_mn_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs; // TODO ai
+	float *dA = sA->dA;
+	float *x = sx->pa + xi;
+	float *z = sz->pa + zi;
+
+	int ii;
+
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+			for(ii=0; ii<n; ii++)
+				dA[ii] = 1.0 / dA[ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+		for(ii=0; ii<n; ii++)
+			dA[ii] = 1.0 / dA[ii];
+		sA->use_dA = 0;
+		}
+
+	if(n>m)
+		n = m;
+	
+	int i, i1;
+	
+	if(x!=z)
+		for(i=0; i<m; i++)
+			z[i] = x[i];
+			
+	i=0;
+	i1 = n%8;
+	if(i1!=0)
+		{
+		kernel_strsv_lt_inv_8_vs_lib8(m-n+i1, &pA[n/bs*bs*sda+(n-i1)*bs], sda, &dA[n-i1], &z[n-i1], &z[n-i1], &z[n-i1], m-n+i1, i1);
+		i += i1;
+		}
+	for(; i<n-7; i+=8)
+		{
+		kernel_strsv_lt_inv_8_lib8(m-n+i+8, &pA[(n-i-8)/bs*bs*sda+(n-i-8)*bs], sda, &dA[n-i-8], &z[n-i-8], &z[n-i-8], &z[n-i-8]);
+		}
+
+	return;
+
+	}
+
+
+
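+// z_n = beta_n * y_n + alpha_n * A * x_n ; z_t = beta_t * y_t + alpha_t * A' * x_t, computed in a single sweep over A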
+void sgemv_nt_libstr(int m, int n, float alpha_n, float alpha_t, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx_n, int xi_n, struct s_strvec *sx_t, int xi_t, float beta_n, float beta_t, struct s_strvec *sy_n, int yi_n, struct s_strvec *sy_t, int yi_t, struct s_strvec *sz_n, int zi_n, struct s_strvec *sz_t, int zi_t)
+	{
+
+	if(ai!=0)
+		{
+		printf("\nsgemv_nt_libstr: feature not implemented yet: ai=%d\n", ai);
+		exit(1);
+		}
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs; // TODO ai
+	float *x_n = sx_n->pa + xi_n;
+	float *x_t = sx_t->pa + xi_t;
+	float *y_n = sy_n->pa + yi_n;
+	float *y_t = sy_t->pa + yi_t;
+	float *z_n = sz_n->pa + zi_n;
+	float *z_t = sz_t->pa + zi_t;
+
+//	if(m<=0 | n<=0)
+//		return;
+
+	int ii;
+
+	// copy and scale y_n into z_n
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		z_n[ii+0] = beta_n*y_n[ii+0];
+		z_n[ii+1] = beta_n*y_n[ii+1];
+		z_n[ii+2] = beta_n*y_n[ii+2];
+		z_n[ii+3] = beta_n*y_n[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		z_n[ii+0] = beta_n*y_n[ii+0];
+		}
+	
+	ii = 0;
+	for(; ii<n-3; ii+=4)
+		{
+		kernel_sgemv_nt_4_lib8(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii);
+		}
+	if(ii<n)
+		{
+		kernel_sgemv_nt_4_vs_lib8(m, &alpha_n, &alpha_t, pA+ii*bs, sda, x_n+ii, x_t, &beta_t, y_t+ii, z_n, z_t+ii, n-ii);
+		}
+	
+	return;
+
+	}
+
+
+
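+// symmetric matrix-vector product: z = beta * y + alpha * A * x, referencing only the lower triangular part of A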
+void ssymv_l_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi)
+	{
+
+//	if(m<=0 | n<=0)
+//		return;
+	
+	const int bs = 8;
+
+	int ii, n1, n2;
+
+	int sda = sA->cn;
+	float *pA = sA->pA + aj*bs + ai/bs*bs*sda + ai%bs;
+	float *x = sx->pa + xi;
+	float *y = sy->pa + yi;
+	float *z = sz->pa + zi;
+
+	// copy and scale y into z
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		z[ii+0] = beta*y[ii+0];
+		z[ii+1] = beta*y[ii+1];
+		z[ii+2] = beta*y[ii+2];
+		z[ii+3] = beta*y[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		z[ii+0] = beta*y[ii+0];
+		}
+	
+	// clean up at the beginning
+	if(ai%bs!=0) // 1, 2, 3
+		{
+		n1 = 8-ai%bs;
+		n2 = n<n1 ? n : n1;
+		kernel_ssymv_l_4l_gen_lib8(m-0, &alpha, ai%bs, &pA[0+(0)*bs], sda, &x[0], &z[0], n2-0);
+		kernel_ssymv_l_4r_gen_lib8(m-4, &alpha, ai%bs, &pA[4+(4)*bs], sda, &x[4], &z[4], n2-4);
+		pA += n1 + n1*bs + (sda-1)*bs;
+		x += n1;
+		z += n1;
+		m -= n1;
+		n -= n1;
+		}
+	// main loop
+	ii = 0;
+	for(; ii<n-7; ii+=8)
+		{
+		kernel_ssymv_l_4l_lib8(m-ii-0, &alpha, &pA[0+(ii+0)*bs+ii*sda], sda, &x[ii+0], &z[ii+0]);
+		kernel_ssymv_l_4r_lib8(m-ii-4, &alpha, &pA[4+(ii+4)*bs+ii*sda], sda, &x[ii+4], &z[ii+4]);
+		}
+	// clean up at the end
+	if(ii<n)
+		{
+		kernel_ssymv_l_4l_gen_lib8(m-ii-0, &alpha, 0, &pA[0+(ii+0)*bs+ii*sda], sda, &x[ii+0], &z[ii+0], n-ii-0);
+		kernel_ssymv_l_4r_gen_lib8(m-ii-4, &alpha, 0, &pA[4+(ii+4)*bs+ii*sda], sda, &x[ii+4], &z[ii+4], n-ii-4);
+		}
+	
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
diff --git a/blas/s_blas3_diag_lib.c b/blas/s_blas3_diag_lib.c
new file mode 100644
index 0000000..23f8e0f
--- /dev/null
+++ b/blas/s_blas3_diag_lib.c
@@ -0,0 +1,49 @@
+
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+#define REAL float
+
+#define STRMAT s_strmat
+#define STRVEC s_strvec
+
+#define GEMM_L_DIAG_LIBSTR sgemm_l_diag_libstr
+#define GEMM_R_DIAG_LIBSTR sgemm_r_diag_libstr
+
+
+
+#include "x_blas3_diag_lib.c"
+
diff --git a/blas/s_blas3_diag_lib4.c b/blas/s_blas3_diag_lib4.c
new file mode 100644
index 0000000..0319212
--- /dev/null
+++ b/blas/s_blas3_diag_lib4.c
@@ -0,0 +1,161 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// sgemm with A diagonal matrix (stored as strvec)
+void sgemm_l_diag_libstr(int m, int n, float alpha, struct s_strvec *sA, int ai, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0 | n<=0)
+		return;
+
+	if(bi!=0 | ci!=0 | di!=0)
+		{
+		printf("\nsgemm_l_diag_libstr: feature not implemented yet: bi=%d, ci=%d, di=%d\n", bi, ci, di);
+		exit(1);
+		}
+
+	const int bs = 4;
+
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *dA = sA->pa + ai;
+	float *pB = sB->pA + bj*bs;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+
+//	sgemm_diag_left_lib(m, n, alpha, dA, pB, sdb, beta, pC, sdc, pD, sdd);
+	int ii;
+
+	ii = 0;
+	if(beta==0.0)
+		{
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_sgemm_diag_left_4_a0_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &pD[ii*sdd]);
+			}
+		}
+	else
+		{
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_sgemm_diag_left_4_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+			}
+		}
+	if(m-ii>0)
+		{
+		if(m-ii==1)
+			kernel_sgemm_diag_left_1_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+		else if(m-ii==2)
+			kernel_sgemm_diag_left_2_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+		else // if(m-ii==3)
+			kernel_sgemm_diag_left_3_lib4(n, &alpha, &dA[ii], &pB[ii*sdb], &beta, &pC[ii*sdc], &pD[ii*sdd]);
+		}
+	
+	return;
+
+	}
+
+
+
+// sgemm with B diagonal matrix (stored as strvec)
+void sgemm_r_diag_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sB, int bi, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0 | n<=0)
+		return;
+
+	if(ai!=0 | ci!=0 | di!=0)
+		{
+		printf("\nsgemm_r_diag_libstr: feature not implemented yet: ai=%d, ci=%d, di=%d\n", ai, ci, di);
+		exit(1);
+		}
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *dB = sB->pa + bi;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+
+	int ii;
+
+	ii = 0;
+	if(beta==0.0)
+		{
+		for( ; ii<n-3; ii+=4)
+			{
+			kernel_sgemm_diag_right_4_a0_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &pD[ii*bs], sdd);
+			}
+		}
+	else
+		{
+		for( ; ii<n-3; ii+=4)
+			{
+			kernel_sgemm_diag_right_4_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+			}
+		}
+	if(n-ii>0)
+		{
+		if(n-ii==1)
+			kernel_sgemm_diag_right_1_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+		else if(n-ii==2)
+			kernel_sgemm_diag_right_2_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+		else // if(n-ii==3)
+			kernel_sgemm_diag_right_3_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+		}
+	return;
+
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
diff --git a/blas/s_blas3_diag_lib8.c b/blas/s_blas3_diag_lib8.c
new file mode 100644
index 0000000..8469345
--- /dev/null
+++ b/blas/s_blas3_diag_lib8.c
@@ -0,0 +1,105 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// sgemm with B diagonal matrix (stored as strvec)
+void sgemm_r_diag_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sB, int bi, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0 | n<=0)
+		return;
+
+	if(ai!=0 | ci!=0 | di!=0)
+		{
+		printf("\nsgemm_r_diag_libstr: feature not implemented yet: ai=%d, ci=%d, di=%d\n", ai, ci, di);
+		exit(1);
+		}
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *dB = sB->pa + bi;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+
+	int ii;
+
+	ii = 0;
+	if(beta==0.0)
+		{
+		for( ; ii<n-3; ii+=4)
+			{
+			kernel_sgemm_diag_right_4_a0_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &pD[ii*bs], sdd);
+			}
+		}
+	else
+		{
+		for( ; ii<n-3; ii+=4)
+			{
+			kernel_sgemm_diag_right_4_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+			}
+		}
+	if(n-ii>0)
+		{
+		if(n-ii==1)
+			kernel_sgemm_diag_right_1_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+		else if(n-ii==2)
+			kernel_sgemm_diag_right_2_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+		else // if(n-ii==3)
+			kernel_sgemm_diag_right_3_lib4(m, &alpha, &pA[ii*bs], sda, &dB[ii], &beta, &pC[ii*bs], sdc, &pD[ii*bs], sdd);
+		}
+	return;
+
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
diff --git a/blas/s_blas3_lib.c b/blas/s_blas3_lib.c
new file mode 100644
index 0000000..dca98ff
--- /dev/null
+++ b/blas/s_blas3_lib.c
@@ -0,0 +1,70 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#if defined(LA_BLAS)
+#if defined(REF_BLAS_BLIS)
+#include "s_blas_64.h"
+#else
+#include "s_blas.h"
+#endif
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+#define REAL float
+
+#define STRMAT s_strmat
+
+#define GEMM_NN_LIBSTR sgemm_nn_libstr
+#define GEMM_NT_LIBSTR sgemm_nt_libstr
+#define SYRK_LN_LIBSTR ssyrk_ln_libstr
+#define SYRK_LN_MN_LIBSTR ssyrk_ln_mn_libstr
+#define TRMM_RLNN_LIBSTR strmm_rlnn_libstr
+#define TRMM_RUTN_LIBSTR strmm_rutn_libstr
+#define TRSM_LLNU_LIBSTR strsm_llnu_libstr
+#define TRSM_LUNN_LIBSTR strsm_lunn_libstr
+#define TRSM_RLTN_LIBSTR strsm_rltn_libstr
+#define TRSM_RLTU_LIBSTR strsm_rltu_libstr
+#define TRSM_RUTN_LIBSTR strsm_rutn_libstr
+
+#define COPY scopy_
+#define GEMM sgemm_
+#define SYRK ssyrk_
+#define TRMM strmm_
+#define TRSM strsm_
+
+
+
+#include "x_blas3_lib.c"
+
diff --git a/blas/s_blas3_lib4.c b/blas/s_blas3_lib4.c
new file mode 100644
index 0000000..c6be38f
--- /dev/null
+++ b/blas/s_blas3_lib4.c
@@ -0,0 +1,1062 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+/****************************
+* old interface
+****************************/
+
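+// D = beta * C + alpha * A * B'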
+void sgemm_nt_lib(int m, int n, int k, float alpha, float *pA, int sda, float *pB, int sdb, float beta, float *pC, int sdc, float *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	const int bs = 4;
+
+	int i, j, l;
+
+	i = 0;
+
+#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+	for(; i<m-15; i+=16)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_sgemm_nt_16x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+i*sdc], sdc, &pD[j*bs+i*sdd], sdd);
+			}
+		if(j<n)
+			{
+			kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+0)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+0)*sdc], &pD[j*bs+(i+0)*sdd], m-(i+0), n-j);
+			kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+4)*sdc], &pD[j*bs+(i+4)*sdd], m-(i+4), n-j);
+			kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+8)*sdc], &pD[j*bs+(i+8)*sdd], m-(i+8), n-j);
+			kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+12)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+12)*sdc], &pD[j*bs+(i+12)*sdd], m-(i+12), n-j);
+			}
+		}
+#endif
+#if defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+	for(; i<m-11; i+=12)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_sgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+i*sdc], sdc, &pD[j*bs+i*sdd], sdd);
+			}
+		if(j<n)
+			{
+			kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+0)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+0)*sdc], &pD[j*bs+(i+0)*sdd], m-(i+0), n-j);
+			kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+4)*sdc], &pD[j*bs+(i+4)*sdd], m-(i+4), n-j);
+			kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+8)*sdc], &pD[j*bs+(i+8)*sdd], m-(i+8), n-j);
+			}
+		}
+#endif
+#if defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+	for(; i<m-7; i+=8)
+		{
+		j = 0;
+#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+		for(; j<n-7; j+=8)
+			{
+			kernel_sgemm_nt_8x8_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*bs+i*sdc], sdc, &pD[j*bs+i*sdd], sdd);
+			}
+#endif
+		for(; j<n-3; j+=4)
+			{
+			kernel_sgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+i*sdc], sdc, &pD[j*bs+i*sdd], sdd);
+			}
+		if(j<n)
+			{
+			kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+0)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+0)*sdc], &pD[j*bs+(i+0)*sdd], m-(i+0), n-j);
+			kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+4)*sdc], &pD[j*bs+(i+4)*sdd], m-(i+4), n-j);
+			}
+		}
+#endif
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_sgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+			}
+		if(j<n)
+			{
+			kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+	left_12:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+0)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+0)*sdc], &pD[j*bs+(i+0)*sdd], m-(i+0), n-j);
+		kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+4)*sdc], &pD[j*bs+(i+4)*sdd], m-(i+4), n-j);
+		kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+8)*sdc], &pD[j*bs+(i+8)*sdd], m-(i+8), n-j);
+		}
+	return;
+
+	left_8:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+0)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+0)*sdc], &pD[j*bs+(i+0)*sdd], m-(i+0), n-j);
+		kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[j*sdb], &beta, &pC[j*bs+(i+4)*sdc], &pD[j*bs+(i+4)*sdd], m-(i+4), n-j);
+		}
+	return;
+
+	left_4:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+		}
+	return;
+
+	}
+
+
+
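+// D = beta * C + alpha * A * B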
+void sgemm_nn_lib(int m, int n, int k, float alpha, float *pA, int sda, float *pB, int sdb, float beta, float *pC, int sdc, float *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	const int bs = 4;
+
+	int i, j, l;
+
+	i = 0;
+
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_sgemm_nn_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*bs], sdb, &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+			}
+		if(j<n)
+			{
+			kernel_sgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*bs], sdb, &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+	left_4:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_sgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*bs], sdb, &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+		}
+	return;
+
+	}
+
+
+
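+// D = beta * C + alpha * A * B', with B upper triangular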
+void strmm_nt_ru_lib(int m, int n, float alpha, float *pA, int sda, float *pB, int sdb, float beta, float *pC, int sdc, float *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	const int bs = 4;
+	
+	int i, j;
+	
+	i = 0;
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_strmm_nt_ru_4x4_lib4(n-j, &alpha, &pA[j*bs+i*sda], &pB[j*bs+j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+			}
+		if(j<n) // TODO specialized edge routine
+			{
+			kernel_strmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*bs+i*sda], &pB[j*bs+j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+			}
+		}
+	if(i<m)
+		{
+		goto left_4;
+		}
+	
+	// common return
+	return;
+
+	left_4:
+	j = 0;
+//	for(; j<n-3; j+=4)
+	for(; j<n; j+=4)
+		{
+		kernel_strmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*bs+i*sda], &pB[j*bs+j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+		}
+//	if(j<n) // TODO specialized edge routine
+//		{
+//		kernel_strmm_nt_ru_4x4_vs_lib4(n-j, &pA[j*bs+i*sda], &pB[j*bs+j*sdb], alg, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+//		}
+	return;
+
+	}
+
+
+
+// D <= B * A^{-T} , with A lower triangular with unit diagonal
+void strsm_nt_rl_one_lib(int m, int n, float *pA, int sda, float *pB, int sdb, float *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	const int bs = 4;
+	
+	int i, j;
+	
+	i = 0;
+
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_strsm_nt_rl_one_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda]);
+			}
+		if(j<n)
+			{
+			kernel_strsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda], m-i, n-j);
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+
+	// common return if i==m
+	return;
+
+	left_4:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_strsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda], m-i, n-j);
+		}
+	return;
+
+	}
+
+
+
+// D <= B * A^{-T} , with A upper triangular employing explicit inverse of diagonal
+void strsm_nt_ru_inv_lib(int m, int n, float *pA, int sda, float *inv_diag_A, float *pB, int sdb, float *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	const int bs = 4;
+	
+	int i, j, idx;
+
+	int rn = n%4;
+
+	float *dummy;
+	
+	i = 0;
+
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		// clean at the end
+		if(rn>0)
+			{
+			idx = n-rn;
+			kernel_strsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &pB[i*sdb+idx*bs], &pD[i*sdd+idx*bs], &pA[idx*sda+idx*bs], &inv_diag_A[idx], m-i, rn);
+			j += rn;
+			}
+		for(; j<n; j+=4)
+			{
+			idx = n-j-4;
+			kernel_strsm_nt_ru_inv_4x4_lib4(j, &pD[i*sdd+(idx+4)*bs], &pA[idx*sda+(idx+4)*bs], &pB[i*sdb+idx*bs], &pD[i*sdd+idx*bs], &pA[idx*sda+idx*bs], &inv_diag_A[idx]);
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+
+	// common return if i==m
+	return;
+
+	left_4:
+	j = 0;
+	// TODO
+	// clean at the end
+	if(rn>0)
+		{
+		idx = n-rn;
+		kernel_strsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &pB[i*sdb+idx*bs], &pD[i*sdd+idx*bs], &pA[idx*sda+idx*bs], &inv_diag_A[idx], m-i, rn);
+		j += rn;
+		}
+	for(; j<n; j+=4)
+		{
+		idx = n-j-4;
+		kernel_strsm_nt_ru_inv_4x4_vs_lib4(j, &pD[i*sdd+(idx+4)*bs], &pA[idx*sda+(idx+4)*bs], &pB[i*sdb+idx*bs], &pD[i*sdd+idx*bs], &pA[idx*sda+idx*bs], &inv_diag_A[idx], m-i, 4);
+		}
+	return;
+
+	}
+
+
+
+// D <= A^{-1} * B , with A lower triangular with unit diagonal
+void strsm_nn_ll_one_lib(int m, int n, float *pA, int sda, float *pB, int sdb, float *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	const int bs = 4;
+	
+	int i, j;
+	
+	i = 0;
+
+	for( ; i<m-3; i+=4)
+		{
+		j = 0;
+		for( ; j<n-3; j+=4)
+			{
+			kernel_strsm_nn_ll_one_4x4_lib4(i, pA+i*sda, pD+j*bs, sdd, pB+i*sdb+j*bs, pD+i*sdd+j*bs, pA+i*sda+i*bs);
+			}
+		if(j<n)
+			{
+			kernel_strsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*bs, sdd, pB+i*sdb+j*bs, pD+i*sdd+j*bs, pA+i*sda+i*bs, m-i, n-j);
+			}
+		}
+	if(i<m)
+		{
+		goto left_4;
+		}
+
+	// common return
+	return;
+
+	left_4:
+	j = 0;
+	for( ; j<n; j+=4)
+		{
+		kernel_strsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*bs, sdd, pB+i*sdb+j*bs, pD+i*sdd+j*bs, pA+i*sda+i*bs, m-i, n-j);
+		}
+	return;
+
+	}
+
+
+
+// D <= A^{-1} * B , with A upper triangular employing explicit inverse of diagonal
+void strsm_nn_lu_inv_lib(int m, int n, float *pA, int sda, float *inv_diag_A, float *pB, int sdb, float *pD, int sdd)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	const int bs = 4;
+	
+	int i, j, idx;
+	float *dummy;
+	
+	i = 0;
+	int rm = m%4;
+	if(rm>0)
+		{
+		// TODO code explicitly the final case
+		idx = m-rm; // position of the part to do
+		j = 0;
+		for( ; j<n; j+=4)
+			{
+			kernel_strsm_nn_lu_inv_4x4_vs_lib4(0, dummy, dummy, 0, pB+idx*sdb+j*bs, pD+idx*sdd+j*bs, pA+idx*sda+idx*bs, inv_diag_A+idx, rm, n-j);
+			}
+		// TODO
+		i += rm;
+		}
+//	int em = m-rm;
+	for( ; i<m; i+=4)
+		{
+		idx = m-i; // position of already done part
+		j = 0;
+		for( ; j<n-3; j+=4)
+			{
+			kernel_strsm_nn_lu_inv_4x4_lib4(i, pA+(idx-4)*sda+idx*bs, pD+idx*sdd+j*bs, sdd, pB+(idx-4)*sdb+j*bs, pD+(idx-4)*sdd+j*bs, pA+(idx-4)*sda+(idx-4)*bs, inv_diag_A+(idx-4));
+			}
+		if(j<n)
+			{
+			kernel_strsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*bs, pD+idx*sdd+j*bs, sdd, pB+(idx-4)*sdb+j*bs, pD+(idx-4)*sdd+j*bs, pA+(idx-4)*sda+(idx-4)*bs, inv_diag_A+(idx-4), 4, n-j);
+			}
+		}
+
+	// common return
+	return;
+
+	}
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// sgemm nt
+void sgemm_nt_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0 | n<=0)
+		return;
+	
+	const int bs = 4;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+
+	if(ai==0 & bi==0 & ci==0 & di==0)
+		{
+		sgemm_nt_lib(m, n, k, alpha, pA, sda, pB, sdb, beta, pC, sdc, pD, sdd); 
+		return;
+		}
+	
+	pA += ai/bs*bs*sda;
+	pB += bi/bs*bs*sdb;
+	int ci0 = ci-ai%bs;
+	int di0 = di-ai%bs;
+	int offsetC;
+	int offsetD;
+	if(ci0>=0)
+		{
+		pC += ci0/bs*bs*sdc;
+		offsetC = ci0%bs;
+		}
+	else
+		{
+		pC += -4*sdc;
+		offsetC = bs+ci0;
+		}
+	if(di0>=0)
+		{
+		pD += di0/bs*bs*sdd;
+		offsetD = di0%bs;
+		}
+	else
+		{
+		pD += -4*sdd;
+		offsetD = bs+di0;
+		}
+	
+	int i, j, l;
+
+	int idxB;
+
+	i = 0;
+	// clean up at the beginning
+	if(ai%bs!=0)
+		{
+		j = 0;
+		idxB = 0;
+		// clean up at the beginning
+		if(bi%bs!=0)
+			{
+			kernel_sgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*bs+i*sdc]-bi%bs*bs, sdc, offsetD, &pD[j*bs+i*sdd]-bi%bs*bs, sdd, ai%bs, m-i, bi%bs, n-j);
+			j += bs-bi%bs;
+			idxB += 4;
+			}
+		// main loop
+		for(; j<n; j+=4)
+			{
+			kernel_sgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*bs+i*sdc], sdc, offsetD, &pD[j*bs+i*sdd], sdd, ai%bs, m-i, 0, n-j);
+			idxB += 4;
+			}
+		m -= bs-ai%bs;
+		pA += bs*sda;
+		pC += bs*sdc;
+		pD += bs*sdd;
+		}
+	// main loop
+	for(; i<m; i+=4)
+		{
+		j = 0;
+		idxB = 0;
+		// clean up at the beginning
+		if(bi%bs!=0)
+			{
+			kernel_sgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*bs+i*sdc]-bi%bs*bs, sdc, offsetD, &pD[j*bs+i*sdd]-bi%bs*bs, sdd, 0, m-i, bi%bs, n-j);
+			j += bs-bi%bs;
+			idxB += 4;
+			}
+		// main loop
+		for(; j<n; j+=4)
+			{
+			kernel_sgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*bs+i*sdc], sdc, offsetD, &pD[j*bs+i*sdd], sdd, 0, m-i, 0, n-j);
+			idxB += 4;
+			}
+		}
+
+	return;
+
+	}
+
+
+
+// sgemm nn
+void sgemm_nn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+	if(m<=0 || n<=0)
+		return;
+	if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+		{
+		printf("\nsgemm_nn_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+		exit(1);
+		}
+	const int bs = 4;
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+	sgemm_nn_lib(m, n, k, alpha, pA, sda, pB, sdb, beta, pC, sdc, pD, sdd); 
+	return;
+	}
+	
+
+
+// strsm_nn_llu
+void strsm_llnu_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+	{
+	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+		{
+		printf("\nstrsm_llnu_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+		exit(1);
+		}
+	const int bs = 4;
+	// TODO alpha
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pD = sD->pA + dj*bs;
+	strsm_nn_ll_one_lib(m, n, pA, sda, pB, sdb, pD, sdd); 
+	return;
+	}
+
+
+
+// strsm_nn_lun
+void strsm_lunn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+	{
+	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+		{
+		printf("\nstrsm_lunn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+		exit(1);
+		}
+	const int bs = 4;
+	// TODO alpha
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pD = sD->pA + dj*bs;
+	float *dA = sA->dA;
+	int ii;
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+			for(ii=0; ii<n; ii++)
+				dA[ii] = 1.0 / dA[ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+		for(ii=0; ii<n; ii++)
+			dA[ii] = 1.0 / dA[ii];
+		sA->use_dA = 0;
+		}
+	strsm_nn_lu_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd); 
+	return;
+	}
+
+
+
+// strsm_right_lower_transposed_notunit
+void strsm_rltn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+		{
+		printf("\nstrsm_rltn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+		exit(1);
+		}
+
+	const int bs = 4;
+
+	// TODO alpha
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pD = sD->pA + dj*bs;
+	float *dA = sA->dA;
+
+	int i, j;
+	
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+			for(i=0; i<n; i++)
+				dA[i] = 1.0 / dA[i];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+		for(i=0; i<n; i++)
+			dA[i] = 1.0 / dA[i];
+		sA->use_dA = 0;
+		}
+
+	if(m<=0 || n<=0)
+		return;
+	
+	i = 0;
+
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<n-3; j+=4)
+			{
+			kernel_strsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda], &dA[j]);
+			}
+		if(j<n)
+			{
+			kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda], &dA[j], m-i, n-j);
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+
+	// common return if i==m
+	return;
+
+	left_4:
+	j = 0;
+	for(; j<n; j+=4)
+		{
+		kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &pB[j*bs+i*sdb], &pD[j*bs+i*sdd], &pA[j*bs+j*sda], &dA[j], m-i, n-j);
+		}
+	return;
+
+	}
+
+
+
+// strsm_right_lower_transposed_unit
+void strsm_rltu_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+	{
+	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+		{
+		printf("\nstrsm_rltu_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+		exit(1);
+		}
+	const int bs = 4;
+	// TODO alpha
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pD = sD->pA + dj*bs;
+	strsm_nt_rl_one_lib(m, n, pA, sda, pB, sdb, pD, sdd); 
+	return;
+	}
+
+
+
+// strsm_right_upper_transposed_notunit
+void strsm_rutn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+	{
+	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+		{
+		printf("\nstrsm_rutn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+		exit(1);
+		}
+	const int bs = 4;
+	// TODO alpha
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pD = sD->pA + dj*bs;
+	float *dA = sA->dA;
+	int ii;
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+			for(ii=0; ii<n; ii++)
+				dA[ii] = 1.0 / dA[ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+		for(ii=0; ii<n; ii++)
+			dA[ii] = 1.0 / dA[ii];
+		sA->use_dA = 0;
+		}
+	strsm_nt_ru_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd); 
+	return;
+	}
+
+
+
+// strmm_right_upper_transposed_notunit (B, i.e. the first matrix, is triangular !!!)
+void strmm_rutn_libstr(int m, int n, float alpha, struct s_strmat *sB, int bi, int bj, struct s_strmat *sA, int ai, int aj, struct s_strmat *sD, int di, int dj)
+	{
+	if(ai!=0 | bi!=0 | di!=0)
+		{
+		printf("\nstrmm_rutn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
+		exit(1);
+		}
+	const int bs = 4;
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pD = sD->pA + dj*bs;
+	strmm_nt_ru_lib(m, n, alpha, pA, sda, pB, sdb, 0.0, pD, sdd, pD, sdd); 
+	return;
+	}
+
+
+
+// strmm_right_lower_nottransposed_notunit (B, i.e. the first matrix, is triangular !!!)
+void strmm_rlnn_libstr(int m, int n, float alpha, struct s_strmat *sB, int bi, int bj, struct s_strmat *sA, int ai, int aj, struct s_strmat *sD, int di, int dj)
+	{
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pD = sD->pA + dj*bs;
+
+	pA += ai/bs*bs*sda;
+	pB += bi/bs*bs*sdb;
+	int offsetB = bi%bs;
+	int di0 = di-ai%bs;
+	int offsetD;
+	if(di0>=0)
+		{
+		pD += di0/bs*bs*sdd;
+		offsetD = di0%bs;
+		}
+	else
+		{
+		pD += -4*sdd;
+		offsetD = bs+di0;
+		}
+	
+	int ii, jj;
+
+	ii = 0;
+	if(ai%bs!=0)
+		{
+		jj = 0;
+		for(; jj<n; jj+=4)
+			{
+			kernel_strmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, ai%bs, m-ii, 0, n-jj);
+			}
+		m -= bs-ai%bs;
+		pA += bs*sda;
+		pD += bs*sdd;
+		}
+	if(offsetD==0)
+		{
+		for(; ii<m-3; ii+=4)
+			{
+			jj = 0;
+			for(; jj<n-5; jj+=4)
+				{
+				kernel_strmm_nn_rl_4x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs]);
+				}
+			for(; jj<n; jj+=4)
+				{
+				kernel_strmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, 0, &pD[ii*sdd+jj*bs], sdd, 0, 4, 0, n-jj);
+				}
+			}
+		if(ii<m)
+			{
+			goto left_4;
+			}
+		}
+	else
+		{
+		for(; ii<m; ii+=4)
+			{
+			jj = 0;
+			for(; jj<n; jj+=4)
+				{
+				kernel_strmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+				}
+			}
+		}
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+	left_4:
+	jj = 0;
+	for(; jj<n; jj+=4)
+		{
+		kernel_strmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+		}
+	return;
+
+	}
+
+
+
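+// D = beta * C + alpha * A * B', computing only the lower triangular part of the result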
+void ssyrk_ln_libstr(int m, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0)
+		return;
+
+	if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+		{
+		printf("\nssyrk_ln_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+		exit(1);
+		}
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+
+//	ssyrk_nt_l_lib(m, n, k, alpha, pA, sda, pB, sdb, beta, pC, sdc, pD, sdd);
+
+	int i, j, l;
+
+	i = 0;
+
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<i; j+=4)
+			{
+			kernel_sgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+			}
+		kernel_ssyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+	left_4:
+	j = 0;
+	for(; j<i; j+=4)
+		{
+		kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, m-j);
+		}
+	kernel_ssyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, m-j);
+	return;
+
+	}
+
+
+
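+// D = beta * C + alpha * A * B', computing only the lower triangular part of the m x n result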
+void ssyrk_ln_mn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+		{
+		printf("\nssyrk_ln_mn_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+		exit(1);
+		}
+
+	const int bs = 4;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+
+//	ssyrk_nt_l_lib(m, n, k, alpha, pA, sda, pB, sdb, beta, pC, sdc, pD, sdd);
+
+	int i, j, l;
+
+	i = 0;
+
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<i && j<n-3; j+=4)
+			{
+			kernel_sgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+			}
+		if(j<n)
+			{
+			if(i<j) // dgemm
+				{
+				kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+				}
+			else // dsyrk
+				{
+				if(j<n-3)
+					{
+					kernel_ssyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd]);
+					}
+				else
+					{
+					kernel_ssyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+	left_4:
+	j = 0;
+	for(; j<i && j<n; j+=4)
+		{
+		kernel_sgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+		}
+	if(j<n)
+		{
+		kernel_ssyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+		}
+	return;
+
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
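
The routines above (and the lib8 routines in the next file) address their operands through BLASFEO's panel-major storage: expressions such as pA = sA->pA + aj*bs, pA += ai/bs*bs*sda and pA[ii*sda+jj*bs] all walk panels of bs rows, where sda = sA->cn is the column dimension padded to a multiple of the panel size. The following sketch is only a reading aid, not part of the library; it assumes nothing beyond the s_strmat fields pA and cn used above and the 4-row panel size of the lib4 code:

	// illustrative only: locate element (i, j) of a panel-major s_strmat
	static inline float s_strmat_el_sketch(const struct s_strmat *sM, int i, int j)
		{
		const int bs = 4;               // panel height used by the lib4 routines
		return sM->pA[(i/bs)*bs*sM->cn  // skip i/bs full panels of bs*cn floats
			+ j*bs                      // column j inside the panel
			+ i%bs];                    // row offset inside the panel
		}

The lib8 file that follows uses the same layout with an 8-row panel (bs = 8).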
diff --git a/blas/s_blas3_lib8.c b/blas/s_blas3_lib8.c
new file mode 100644
index 0000000..f0f5144
--- /dev/null
+++ b/blas/s_blas3_lib8.c
@@ -0,0 +1,1325 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
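+// D <= beta * C + alpha * A * B^T  (A: m x k, B: n x k, C and D: m x n; the _nt suffix marks B as transposed)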
+void sgemm_nt_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(m==0 | n==0)
+		return;
+	
+#if defined(DIM_CHECK)
+	// TODO check that sA!=sD or that, if sA==sD, they do not overlap (same for sB)
+	// non-negative size
+	if(m<0) printf("\n****** sgemm_nt_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** sgemm_nt_libstr : n<0 : %d<0 *****\n", n);
+	if(k<0) printf("\n****** sgemm_nt_libstr : k<0 : %d<0 *****\n", k);
+	// non-negative offset
+	if(ai<0) printf("\n****** sgemm_nt_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** sgemm_nt_libstr : aj<0 : %d<0 *****\n", aj);
+	if(bi<0) printf("\n****** sgemm_nt_libstr : bi<0 : %d<0 *****\n", bi);
+	if(bj<0) printf("\n****** sgemm_nt_libstr : bj<0 : %d<0 *****\n", bj);
+	if(ci<0) printf("\n****** sgemm_nt_libstr : ci<0 : %d<0 *****\n", ci);
+	if(cj<0) printf("\n****** sgemm_nt_libstr : cj<0 : %d<0 *****\n", cj);
+	if(di<0) printf("\n****** sgemm_nt_libstr : di<0 : %d<0 *****\n", di);
+	if(dj<0) printf("\n****** sgemm_nt_libstr : dj<0 : %d<0 *****\n", dj);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** sgemm_nt_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+k > sA->n) printf("\n***** sgemm_nt_libstr : aj+k > col(A) : %d+%d > %d *****\n", aj, k, sA->n);
+	// B: n x k
+	if(bi+n > sB->m) printf("\n***** sgemm_nt_libstr : bi+n > row(B) : %d+%d > %d *****\n", bi, n, sB->m);
+	if(bj+k > sB->n) printf("\n***** sgemm_nt_libstr : bj+k > col(B) : %d+%d > %d *****\n", bj, k, sB->n);
+	// C: m x n
+	if(ci+m > sC->m) printf("\n***** sgemm_nt_libstr : ci+m > row(C) : %d+%d > %d *****\n", ci, m, sC->m);
+	if(cj+n > sC->n) printf("\n***** sgemm_nt_libstr : cj+n > col(C) : %d+%d > %d *****\n", cj, n, sC->n);
+	// D: m x n
+	if(di+m > sD->m) printf("\n***** sgemm_nt_libstr : di+m > row(D) : %d+%d > %d *****\n", di, m, sD->m);
+	if(dj+n > sD->n) printf("\n***** sgemm_nt_libstr : dj+n > col(D) : %d+%d > %d *****\n", dj, n, sD->n);
+#endif
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+
+	int i, j, l;
+
+	i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-23; i+=24)
+		{
+		j = 0;
+		for(; j<n-7; j+=8)
+			{
+			kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+			kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+			}
+		if(j<n)
+			{
+			if(j<n-3)
+				{
+				kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+				if(j<n-4)
+					{
+					kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, 8, n-(j+4));
+					}
+				}
+			else
+				{
+				kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, 8, n-j);
+				}
+			}
+		}
+	if(m-i>0)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else if(m-i<=12)
+			{
+			goto left_12;
+			}
+		else if(m-i<=16)
+			{
+			goto left_16;
+			}
+//		else if(m-i<=20)
+//			{
+//			goto left_20;
+//			}
+		else
+			{
+			goto left_24;
+			}
+		}
+#else
+	for(; i<m-15; i+=16)
+		{
+		j = 0;
+		for(; j<n-7; j+=8)
+			{
+			kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+			kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+			}
+		if(j<n)
+			{
+			if(j<n-3)
+				{
+				kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+				if(j<n-4)
+					{
+					kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, 8, n-(j+4));
+					}
+				}
+			else
+				{
+				kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, 8, n-j);
+				}
+			}
+		}
+	if(m-i>0)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else if(m-i<=12)
+			{
+			goto left_12;
+			}
+		else
+			{
+			goto left_16;
+			}
+		}
+#endif
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+	left_24:
+	j = 0;
+	for(; j<n-4; j+=8)
+		{
+		kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, 4);
+		kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+		}
+	if(j<n)
+		{
+		kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+		}
+	return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_20:
+	j = 0;
+	for(; j<n-4; j+=8)
+		{
+		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, 4);
+		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+		kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(i+16)*sdc], &pD[(j+0)*bs+(i+16)*sdd], m-(i+16), n-j);
+		}
+	if(j<n)
+		{
+		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+		kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(i+16)*sdc], &pD[(j+0)*bs+(i+16)*sdd], m-(i+16), n-j);
+		}
+	return;
+#endif
+
+	left_16:
+	j = 0;
+	for(; j<n-4; j+=8)
+		{
+		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, 4);
+		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+		}
+	if(j<n)
+		{
+		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+		}
+	return;
+
+#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	left_12:
+	j = 0;
+	for(; j<n-4; j+=8)
+		{
+		kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+		kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(i+8)*sdc], &pD[(j+0)*bs+(i+8)*sdd], m-(i+8), n-j);
+		}
+	if(j<n)
+		{
+		kernel_sgemm_nt_8x4_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+		kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(i+8)*sdc], &pD[(j+0)*bs+(i+8)*sdd], m-(i+8), n-j);
+		}
+	return;
+#endif
+
+	left_8:
+	j = 0;
+	for(; j<n-4; j+=8)
+		{
+		kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+		}
+	if(j<n)
+		{
+		kernel_sgemm_nt_8x4_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+		}
+	return;
+
+#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	left_4:
+	j = 0;
+	for(; j<n; j+=8)
+		{
+		kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+		}
+	return;
+#endif
+
+	}
+
+
+
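+// D <= beta * C + alpha * A * B  (A: m x k, B: k x n, C and D: m x n)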
+void sgemm_nn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(m==0 | n==0)
+		return;
+	
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** sgemm_nn_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** sgemm_nn_libstr : n<0 : %d<0 *****\n", n);
+	if(k<0) printf("\n****** sgemm_nn_libstr : k<0 : %d<0 *****\n", k);
+	// non-negative offset
+	if(ai<0) printf("\n****** sgemm_nn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** sgemm_nn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(bi<0) printf("\n****** sgemm_nn_libstr : bi<0 : %d<0 *****\n", bi);
+	if(bj<0) printf("\n****** sgemm_nn_libstr : bj<0 : %d<0 *****\n", bj);
+	if(ci<0) printf("\n****** sgemm_nn_libstr : ci<0 : %d<0 *****\n", ci);
+	if(cj<0) printf("\n****** sgemm_nn_libstr : cj<0 : %d<0 *****\n", cj);
+	if(di<0) printf("\n****** sgemm_nn_libstr : di<0 : %d<0 *****\n", di);
+	if(dj<0) printf("\n****** sgemm_nn_libstr : dj<0 : %d<0 *****\n", dj);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** sgemm_nn_libstr : ai+m > row(A) : %d+%d > %d *****\n\n", ai, m, sA->m);
+	if(aj+k > sA->n) printf("\n***** sgemm_nn_libstr : aj+k > col(A) : %d+%d > %d *****\n\n", aj, k, sA->n);
+	// B: k x n
+	if(bi+k > sB->m) printf("\n***** sgemm_nn_libstr : bi+k > row(B) : %d+%d > %d *****\n\n", bi, k, sB->m);
+	if(bj+n > sB->n) printf("\n***** sgemm_nn_libstr : bj+n > col(B) : %d+%d > %d *****\n\n", bj, n, sB->n);
+	// C: m x n
+	if(ci+m > sC->m) printf("\n***** sgemm_nn_libstr : ci+m > row(C) : %d+%d > %d *****\n\n", ci, m, sC->m);
+	if(cj+n > sC->n) printf("\n***** sgemm_nn_libstr : cj+n > col(C) : %d+%d > %d *****\n\n", cj, n, sC->n);
+	// D: m x n
+	if(di+m > sD->m) printf("\n***** sgemm_nn_libstr : di+m > row(D) : %d+%d > %d *****\n\n", di, m, sD->m);
+	if(dj+n > sD->n) printf("\n***** sgemm_nn_libstr : dj+n > col(D) : %d+%d > %d *****\n\n", dj, n, sD->n);
+#endif
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs + bi/bs*bs*sdb;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+
+	int offsetB = bi%bs;
+
+	int i, j, l;
+
+	i = 0;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-23; i+=24)
+		{
+		j = 0;
+		for(; j<n-7; j+=8)
+			{
+			kernel_sgemm_nn_24x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+			kernel_sgemm_nn_24x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+			}
+		if(j<n)
+			{
+			if(j<n-3)
+				{
+				kernel_sgemm_nn_24x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+				if(j<n-4)
+					{
+					kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, 16, n-(j+4));
+					}
+				}
+			else
+				{
+				kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, 16, n-j);
+				}
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else if(m-i<=16)
+			{
+			goto left_16;
+			}
+		else
+			{
+			goto left_24;
+			}
+		}
+#else
+#if 1
+	for(; i<m-15; i+=16)
+		{
+		j = 0;
+		for(; j<n-7; j+=8)
+			{
+			kernel_sgemm_nn_16x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+			kernel_sgemm_nn_16x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+			}
+		if(j<n)
+			{
+			if(j<n-3)
+				{
+				kernel_sgemm_nn_16x4_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+				if(j<n-4)
+					{
+					kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, 16, n-(j+4));
+					}
+				}
+			else
+				{
+				kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, 16, n-j);
+				}
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else
+			{
+			goto left_16;
+			}
+		}
+#else
+	for(; i<m-7; i+=8)
+		{
+		j = 0;
+		for(; j<n-7; j+=8)
+			{
+#if 1
+			kernel_sgemm_nn_8x8_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd]);
+#else
+			kernel_sgemm_nn_8x4_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd]);
+			kernel_sgemm_nn_8x4_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], &pD[(j+4)*bs+i*sdd]);
+#endif
+			}
+		if(j<n)
+			{
+			if(j<n-3)
+				{
+				kernel_sgemm_nn_8x4_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd]);
+				if(j<n-4)
+					{
+					kernel_sgemm_nn_8x4_gen_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+4)*bs], sdb, &beta, 0, &pC[(j+4)*bs+i*sdc], sdc, 0, &pD[(j+4)*bs+i*sdd], sdd, 0, 8, 0, n-(j+4));
+					}
+				}
+			else
+				{
+				kernel_sgemm_nn_8x4_gen_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, 0, &pC[(j+0)*bs+i*sdc], sdc, 0, &pD[(j+0)*bs+i*sdd], sdd, 0, 8, 0, n-j);
+				}
+			}
+		}
+	if(m>i)
+		{
+		goto left_8;
+		}
+#endif
+#endif
+
+	// common return if i==m
+	return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_24:
+	j = 0;
+	for(; j<n-4; j+=8)
+		{
+		kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+		kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+		}
+	if(j<n)
+		{
+		kernel_sgemm_nn_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+		}
+	return;
+#endif
+
+	left_16:
+	j = 0;
+	for(; j<n-4; j+=8)
+		{
+		kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+		kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+4)*bs], sdb, &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+		}
+	if(j<n)
+		{
+		kernel_sgemm_nn_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-j);
+		}
+	return;
+
+	left_8:
+	j = 0;
+	for(; j<n-4; j+=8)
+		{
+		kernel_sgemm_nn_8x8_vs_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+		}
+	if(j<n)
+		{
+		kernel_sgemm_nn_8x4_vs_lib8(k, &alpha, &pA[i*sda], offsetB, &pB[(j+0)*bs], sdb, &beta, &pC[(j+0)*bs+i*sdc], &pD[(j+0)*bs+i*sdd], m-i, n-j);
+		}
+	return;
+
+	}
+
+
+
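+// D <= beta * C + alpha * A * B^T, lower triangular part only  (C and D: m x m, A and B: m x k)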
+void ssyrk_ln_libstr(int m, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0)
+		return;
+
+	if(ci>0 | di>0)
+		{
+		printf("\nssyrk_ln_libstr: feature not implemented yet: ci>0, di>0\n");
+		exit(1);
+		}
+
+	const int bs = 8;
+
+	int i, j;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+
+	i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-23; i+=24)
+		{
+		j = 0;
+		for(; j<i; j+=8)
+			{
+			kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+			kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+			}
+
+		kernel_ssyrk_nt_l_24x4_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd);
+		kernel_ssyrk_nt_l_20x4_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd);
+		kernel_ssyrk_nt_l_16x4_lib8(k, &alpha, &pA[(j+8)*sda], sda, &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd);
+		kernel_ssyrk_nt_l_12x4_lib8(k, &alpha, &pA[(j+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd);
+		kernel_ssyrk_nt_l_8x8_lib8(k, &alpha, &pA[(j+16)*sda], &pB[0+(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd]);
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else if(m-i<=12)
+			{
+			goto left_12;
+			}
+		else if(m-i<=16)
+			{
+			goto left_16;
+			}
+		else
+			{
+			goto left_24;
+			}
+		}
+#else
+	for(; i<m-15; i+=16)
+		{
+		j = 0;
+		for(; j<i; j+=8)
+			{
+			kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+			kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+			}
+		kernel_ssyrk_nt_l_16x4_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd);
+		kernel_ssyrk_nt_l_12x4_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd);
+		kernel_ssyrk_nt_l_8x8_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd]);
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else if(m-i<=12)
+			{
+			goto left_12;
+			}
+		else
+			{
+			goto left_16;
+			}
+		}
+#endif
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_24: // 17 <= m <= 23
+	j = 0;
+	for(; j<i & j<m-7; j+=8)
+		{
+		kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, m-(j+0));
+		kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, m-(j+4));
+		}
+	kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), m-(j+0));
+	kernel_ssyrk_nt_l_20x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), m-(j+4));
+	kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(j+8)*sda], sda, &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), m-(j+8));
+	kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(j+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, m-(i+8), m-(j+12));
+	if(j<m-20) // 21 - 23
+		{
+		kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(j+16)*sda], &pB[0+(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), m-(j+16));
+		}
+	else // 17 18 19 20
+		{
+		kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+16)*sda], &pB[0+(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), m-(j+16));
+		}
+	return;
+#endif
+
+	left_16: // 13 <= m <= 16
+	j = 0;
+	for(; j<i; j+=8)
+		{
+		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, m-(j+0));
+		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, m-(j+4));
+		}
+	kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), m-(j+0));
+	kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), m-(j+4));
+	if(j<m-12) // 13 - 16
+		{
+		kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), m-(j+8));
+		}
+	else // 9 - 12
+		{
+		kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), m-(j+8));
+		}
+	return;
+
+	left_12: // 9 <= m <= 12
+	j = 0;
+	for(; j<i; j+=8)
+		{
+		kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[(i+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(i+0)*sdc], &pD[(j+0)*bs+(i+0)*sdd], m-(i+0), m-(j+0));
+		kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(i+8)*sdc], &pD[(j+0)*bs+(i+8)*sdd], m-(i+0), m-(j+0));
+		}
+	kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(j+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], &pD[(j+0)*bs+(j+0)*sdd], m-(i+0), m-(j+0));
+	kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+8)*sdc], &pD[(j+0)*bs+(j+8)*sdd], m-(i+8), m-(j+0));
+	if(j<m-8) // 9 - 12
+		{
+		kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+8)*sda], &pB[0+(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), m-(j+8));
+		}
+	return;
+
+	left_8: // 5 <= m <= 8
+	j = 0;
+	for(; j<i; j+=8)
+		{
+		kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[(i+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(i+0)*sdc], &pD[(j+0)*bs+(i+0)*sdd], m-(i+0), m-(j+0));
+		}
+	if(j<m-4) // 5 - 8
+		{
+		kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(j+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], &pD[(j+0)*bs+(j+0)*sdd], m-(i+0), m-(j+0));
+		}
+	else // 1 - 4
+		{
+		kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], &pD[(j+0)*bs+(j+0)*sdd], m-(i+0), m-(j+0));
+		}
+	return;
+
+	left_4: // 1 <= m <= 4
+	j = 0;
+	for(; j<i; j+=8)
+		{
+		kernel_sgemm_nt_4x8_vs_lib8(k, &alpha, &pA[(i+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(i+0)*sdc], &pD[(j+0)*bs+(i+0)*sdd], m-(i+0), m-(j+0));
+		}
+	kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(j+0)*sda], &pB[0+(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], &pD[(j+0)*bs+(j+0)*sdd], m-(i+0), m-(j+0));
+	return;
+
+	}
+
+
+
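+// as ssyrk_ln above, but on an m x n result: only the part on and below the diagonal is computed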
+void ssyrk_ln_mn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0)
+		return;
+
+	if(ci>0 | di>0)
+		{
+		printf("\nssyrk_ln_mn_libstr: feature not implemented yet: ci>0, di>0\n");
+		exit(1);
+		}
+
+	const int bs = 8;
+
+	int i, j;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+
+	i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-23; i+=24)
+		{
+		j = 0;
+		for(; j<i & j<n-7; j+=8)
+			{
+			kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+			kernel_sgemm_nt_24x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+			}
+		if(j<n)
+			{
+			if(i<j) // gemm
+				{
+				kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+				if(j<n-4) // 5 6 7
+					{
+					kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+					}
+				}
+			else // syrk
+				{
+				if(j<n-23)
+					{
+					kernel_ssyrk_nt_l_24x4_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd);
+					kernel_ssyrk_nt_l_20x4_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd);
+					kernel_ssyrk_nt_l_16x4_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd);
+					kernel_ssyrk_nt_l_12x4_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd);
+					kernel_ssyrk_nt_l_8x8_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd]);
+					}
+				else
+					{
+					if(j<n-4) // 5 - 23
+						{
+						kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+0));
+						kernel_ssyrk_nt_l_20x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+4));
+						if(j==n-8)
+							return;
+						if(j<n-12) // 13 - 23
+							{
+							kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+8));
+							kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+12));
+							if(j==n-16)
+								return;
+							if(j<n-20) // 21 - 23
+								{
+								kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), n-(j+16));
+								}
+							else // 17 18 19 20
+								{
+								kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), n-(j+16));
+								}
+							}
+						else // 9 10 11 12
+							{
+							kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+8));
+							}
+						}
+					else // 1 2 3 4
+						{
+						kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, m-(i+0), n-j);
+						}
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else if(m-i<=16)
+			{
+			goto left_16;
+			}
+		else
+			{
+			goto left_24;
+			}
+		}
+#else
+	for(; i<m-15; i+=16)
+		{
+		j = 0;
+		for(; j<i & j<n-7; j+=8)
+			{
+			kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd);
+			kernel_sgemm_nt_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd);
+			}
+		if(j<n)
+			{
+			if(i<j) // gemm
+				{
+				kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+				if(j<n-4) // 5 6 7
+					{
+					kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+					}
+				}
+			else // syrk
+				{
+				if(j<n-15)
+					{
+					kernel_ssyrk_nt_l_16x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd);
+					kernel_ssyrk_nt_l_12x4_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd);
+					kernel_ssyrk_nt_l_8x8_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd]);
+					}
+				else
+					{
+					if(j<n-4) // 5 - 15
+						{
+						kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+0));
+						kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+4));
+						if(j==n-8) // 8
+							return;
+						if(j<n-12) // 13 - 15
+							{
+							kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), n-(j+8));
+							}
+						else // 9 10 11 12
+							{
+							kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), n-(j+8));
+							}
+						}
+					else // 1 2 3 4
+						{
+						kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, m-(i+0), n-j);
+						}
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else
+			{
+			goto left_16;
+			}
+		}
+#endif
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_24:
+	j = 0;
+	for(; j<i & j<n-7; j+=8)
+		{
+		kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+		kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+		}
+	if(j<n)
+		{
+		if(j<i) // gemm
+			{
+			kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+			if(j<n-4) // 5 6 7
+				{
+				kernel_sgemm_nt_24x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+				}
+			}
+		else // syrk
+			{
+			if(j<n-4) // 5 - 23
+				{
+				kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], &beta, &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+0));
+				kernel_ssyrk_nt_l_20x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], &beta, &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, m-(i+0), n-(j+4));
+				if(j>=n-8)
+					return;
+				if(j<n-12) // 13 - 23
+					{
+					kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+8));
+					kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], &beta, &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+12));
+					if(j>=n-16)
+						return;
+					if(j<n-20) // 21 - 23
+						{
+						kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), n-(j+16));
+						}
+					else // 17 18 19 20
+						{
+						kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(i+16)*sda], &pB[(j+16)*sdb], &beta, &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], m-(i+16), n-(j+16));
+						}
+					}
+				else // 9 10 11 12
+					{
+					kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, m-(i+8), n-(j+8));
+					}
+				}
+			else // 1 2 3 4
+				{
+				kernel_ssyrk_nt_l_24x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, m-(i+0), n-j);
+				}
+			}
+		}
+	return;
+#endif
+
+	left_16:
+	j = 0;
+	for(; j<i & j<n-7; j+=8)
+		{
+		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+		kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+		}
+	if(j<n)
+		{
+		if(j<i) // gemm
+			{
+			kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, m-i, n-(j+0));
+			if(j<n-4) // 5 6 7
+				{
+				kernel_sgemm_nt_16x4_vs_lib8(k, &alpha, &pA[i*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, m-i, n-(j+4));
+				}
+			}
+		else // syrk
+			{
+			if(j<n-4) // 5 - 15
+				{
+				kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[0+j*sdb], &beta, &pC[(j+0)*bs+j*sdc], sdc, &pD[(j+0)*bs+j*sdd], sdd, m-(i+0), n-(j+0));
+				kernel_ssyrk_nt_l_12x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[4+j*sdb], &beta, &pC[(j+4)*bs+j*sdc], sdc, &pD[(j+4)*bs+j*sdd], sdd, m-(i+0), n-(j+4));
+				if(j>=n-8)
+					return;
+				if(j<n-12) // 13 - 15
+					{
+					kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), n-(j+8));
+					}
+				else // 9 - 12
+					{
+					kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], m-(i+8), n-(j+8));
+					}
+				}
+			else // 1 2 3 4
+				{
+				kernel_ssyrk_nt_l_16x4_vs_lib8(k, &alpha, &pA[(i+0)*sda], sda, &pB[j*sdb], &beta, &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, m-(i+0), n-j);
+				}
+			}
+		}
+	return;
+
+	left_8:
+	j = 0;
+	for(; j<i & j<n-7; j+=8)
+		{
+		kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+		}
+	if(j<n)
+		{
+		if(j<i) // gemm
+			{
+			if(j<n-4) // 5 6 7
+				{
+				kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+				}
+			else // 1 2 3 4
+				{
+				kernel_sgemm_nt_8x4_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], m-i, n-j);
+				}
+			}
+		else // syrk
+			{
+			if(j<n-4) // 5 6 7
+				{
+				kernel_ssyrk_nt_l_8x8_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], m-i, n-j);
+				}
+			else // 1 2 3 4
+				{
+				kernel_ssyrk_nt_l_8x4_vs_lib8(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], m-i, n-j);
+				}
+			}
+		}
+	return;
+
+	}
+
+
+
+// dtrmm_right_lower_nottransposed_notunit (B, i.e. the first matrix, is triangular !!!)
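+// D <= alpha * A * B  (A: m x n, B: n x n lower triangular, not transposed, not unit diagonal)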
+void strmm_rlnn_libstr(int m, int n, float alpha, struct s_strmat *sB, int bi, int bj, struct s_strmat *sA, int ai, int aj, struct s_strmat *sD, int di, int dj)
+	{
+
+	const int bs = 8;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pD = sD->pA + dj*bs;
+
+	pA += ai/bs*bs*sda;
+	pB += bi/bs*bs*sdb;
+	int offsetB = bi%bs;
+	int di0 = di-ai%bs;
+	int offsetD;
+	if(di0>=0)
+		{
+		pD += di0/bs*bs*sdd;
+		offsetD = di0%bs;
+		}
+	else
+		{
+		pD += -8*sdd;
+		offsetD = bs+di0;
+		}
+	
+	int ii, jj;
+
+	int offsetB4;
+
+	if(offsetB<4)
+		{
+		offsetB4 = offsetB+4;
+		ii = 0;
+		if(ai%bs!=0)
+			{
+			jj = 0;
+			for(; jj<n-4; jj+=8)
+				{
+				kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, ai%bs, m-ii, 0, n-jj);
+				kernel_strmm_nn_rl_8x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, ai%bs, m-ii, 0, n-jj-4);
+				}
+			m -= bs-ai%bs;
+			pA += bs*sda;
+			pD += bs*sdd;
+			}
+		if(offsetD==0)
+			{
+#if defined(TARGET_X64_INTEL_HASWELL)
+			// XXX create left_24 once the _gen_ kernel exists !!!
+			for(; ii<m-23; ii+=24)
+				{
+				jj = 0;
+				for(; jj<n-7; jj+=8)
+					{
+					kernel_strmm_nn_rl_24x4_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd);
+					kernel_strmm_nn_rl_24x4_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd);
+					}
+				if(n-jj>0)
+					{
+					kernel_strmm_nn_rl_24x4_vs_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd, 24, n-jj);
+					if(n-jj>4)
+						{
+						kernel_strmm_nn_rl_24x4_vs_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd, 24, n-jj-4);
+						}
+					}
+				}
+#endif
+			for(; ii<m-15; ii+=16)
+				{
+				jj = 0;
+				for(; jj<n-7; jj+=8)
+					{
+					kernel_strmm_nn_rl_16x4_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd);
+					kernel_strmm_nn_rl_16x4_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd);
+					}
+				if(n-jj>0)
+					{
+					kernel_strmm_nn_rl_16x4_vs_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd, 16, n-jj);
+					if(n-jj>4)
+						{
+						kernel_strmm_nn_rl_16x4_vs_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd, 16, n-jj-4);
+						}
+					}
+				}
+			if(m-ii>0)
+				{
+				if(m-ii<=8)
+					goto left_8;
+				else
+					goto left_16;
+				}
+			}
+		else
+			{
+			for(; ii<m-8; ii+=16)
+				{
+				jj = 0;
+				for(; jj<n-4; jj+=8)
+					{
+					kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+					kernel_strmm_nn_rl_16x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+					}
+				if(n-jj>0)
+					{
+					kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+					}
+				}
+			if(m-ii>0)
+				goto left_8;
+			}
+		}
+	else
+		{
+		offsetB4 = offsetB-4;
+		ii = 0;
+		if(ai%bs!=0)
+			{
+			jj = 0;
+			for(; jj<n-4; jj+=8)
+				{
+				kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, ai%bs, m-ii, 0, n-jj);
+				kernel_strmm_nn_rl_8x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, ai%bs, m-ii, 0, n-jj-4);
+				}
+			m -= bs-ai%bs;
+			pA += bs*sda;
+			pD += bs*sdd;
+			}
+		if(offsetD==0)
+			{
+			for(; ii<m-15; ii+=16)
+				{
+				jj = 0;
+				for(; jj<n-7; jj+=8)
+					{
+					kernel_strmm_nn_rl_16x4_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd);
+					kernel_strmm_nn_rl_16x4_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd);
+					}
+				if(n-jj>0)
+					{
+					kernel_strmm_nn_rl_16x4_vs_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, &pD[ii*sdd+jj*bs], sdd, 8, n-jj);
+					if(n-jj>4)
+						{
+						kernel_strmm_nn_rl_16x4_vs_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, &pD[ii*sdd+(jj+4)*bs], sdd, 8, n-jj-4);
+						}
+					}
+				}
+			if(m-ii>0)
+				{
+				if(m-ii<=8)
+					goto left_8;
+				else
+					goto left_16;
+				}
+			}
+		else
+			{
+			for(; ii<m-8; ii+=16)
+				{
+				jj = 0;
+				for(; jj<n-4; jj+=8)
+					{
+					kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+					kernel_strmm_nn_rl_16x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+					}
+				if(n-jj>0)
+					{
+					kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+					}
+				}
+			if(m-ii>0)
+				goto left_8;
+			}
+		}
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+	left_16:
+	if(offsetB<4)
+		{
+		jj = 0;
+		for(; jj<n-4; jj+=8)
+			{
+			kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+			kernel_strmm_nn_rl_16x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+			}
+		if(n-jj>0)
+			{
+			kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+			}
+		}
+	else
+		{
+		jj = 0;
+		for(; jj<n-4; jj+=8)
+			{
+			kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+			kernel_strmm_nn_rl_16x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], sda, offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+			}
+		if(n-jj>0)
+			{
+			kernel_strmm_nn_rl_16x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], sda, offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+			}
+		}
+	return;
+
+	left_8:
+	if(offsetB<4)
+		{
+		jj = 0;
+		for(; jj<n-4; jj+=8)
+			{
+			kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+			kernel_strmm_nn_rl_8x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], offsetB4, &pB[jj*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+			}
+		if(n-jj>0)
+			{
+			kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+			}
+		}
+	else
+		{
+		jj = 0;
+		for(; jj<n-4; jj+=8)
+			{
+			kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+			kernel_strmm_nn_rl_8x4_gen_lib8(n-jj-4, &alpha, &pA[ii*sda+(jj+4)*bs], offsetB4, &pB[(jj+8)*sdb+(jj+4)*bs], sdb, offsetD, &pD[ii*sdd+(jj+4)*bs], sdd, 0, m-ii, 0, n-jj-4);
+			}
+		if(n-jj>0)
+			{
+			kernel_strmm_nn_rl_8x4_gen_lib8(n-jj, &alpha, &pA[ii*sda+jj*bs], offsetB, &pB[jj*sdb+jj*bs], sdb, offsetD, &pD[ii*sdd+jj*bs], sdd, 0, m-ii, 0, n-jj);
+			}
+		}
+	return;
+
+	}
+
+
+
+// dtrsm_right_lower_transposed_notunit
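+// solves D * A^T = alpha * B for D  (B, D: m x n; A: n x n lower triangular, not unit diagonal)
+// only alpha==1.0 is supported below; the reciprocals of diag(A) are cached in sA->dA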
+void strsm_rltn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
+		{
+		printf("\nstrsm_rltn_libstr: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
+		exit(1);
+		}
+
+	const int bs = 8;
+
+	// TODO alpha
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pD = sD->pA + dj*bs;
+	float *dA = sA->dA;
+
+	int i, j;
+	
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+			for(i=0; i<n; i++)
+				dA[i] = 1.0 / dA[i];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		sdiaex_lib(n, 1.0, ai, pA, sda, dA);
+		for(i=0; i<n; i++)
+			dA[i] = 1.0 / dA[i];
+		sA->use_dA = 0;
+		}
+
+	if(m<=0 || n<=0)
+		return;
+	
+	i = 0;
+
+	for(; i<m-7; i+=8)
+		{
+		j = 0;
+		for(; j<n-7; j+=8)
+			{
+			kernel_strsm_nt_rl_inv_8x4_lib8(j+0, &pD[i*sdd], &pA[0+j*sda], &pB[(j+0)*bs+i*sdb], &pD[(j+0)*bs+i*sdd], &pA[0+(j+0)*bs+j*sda], &dA[j+0]);
+			kernel_strsm_nt_rl_inv_8x4_lib8(j+4, &pD[i*sdd], &pA[4+j*sda], &pB[(j+4)*bs+i*sdb], &pD[(j+4)*bs+i*sdd], &pA[4+(j+4)*bs+j*sda], &dA[j+4]);
+			}
+		if(n-j>0)
+			{
+			kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+0, &pD[i*sdd], &pA[0+j*sda], &pB[(j+0)*bs+i*sdb], &pD[(j+0)*bs+i*sdd], &pA[0+(j+0)*bs+j*sda], &dA[j+0], m-i, n-j-0);
+			if(n-j>4)
+				{
+				kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+4, &pD[i*sdd], &pA[4+j*sda], &pB[(j+4)*bs+i*sdb], &pD[(j+4)*bs+i*sdd], &pA[4+(j+4)*bs+j*sda], &dA[j+4], m-i, n-j-4);
+				}
+			}
+		}
+	if(m>i)
+		{
+		goto left_8;
+		}
+
+	// common return if i==m
+	return;
+
+	left_8:
+	j = 0;
+	for(; j<n-4; j+=8)
+		{
+		kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+0, &pD[i*sdd], &pA[0+j*sda], &pB[(j+0)*bs+i*sdb], &pD[(j+0)*bs+i*sdd], &pA[0+(j+0)*bs+j*sda], &dA[j+0], m-i, n-j-0);
+		kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+4, &pD[i*sdd], &pA[4+j*sda], &pB[(j+4)*bs+i*sdb], &pD[(j+4)*bs+i*sdd], &pA[4+(j+4)*bs+j*sda], &dA[j+4], m-i, n-j-4);
+		}
+	if(n-j>0)
+		{
+		kernel_strsm_nt_rl_inv_8x4_vs_lib8(j+0, &pD[i*sdd], &pA[0+j*sda], &pB[(j+0)*bs+i*sdb], &pD[(j+0)*bs+i*sdd], &pA[0+(j+0)*bs+j*sda], &dA[j+0], m-i, n-j-0);
+		}
+	return;
+
+	}
+
+
+
+
diff --git a/blas/s_blas_64.h b/blas/s_blas_64.h
new file mode 100644
index 0000000..1589867
--- /dev/null
+++ b/blas/s_blas_64.h
@@ -0,0 +1,65 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// headers to reference BLAS and LAPACK routines employed in BLASFEO WR
+
+// level 1
+void scopy_(long long *m, float *x, long long *incx, float *y, long long *incy);
+void saxpy_(long long *m, float *alpha, float *x, long long *incx, float *y, long long *incy);
+void sscal_(long long *m, float *alpha, float *x, long long *incx);
+
+// level 2
+void sgemv_(char *ta, long long *m, long long *n, float *alpha, float *A, long long *lda, float *x, long long *incx, float *beta, float *y, long long *incy);
+void ssymv_(char *uplo, long long *m, float *alpha, float *A, long long *lda, float *x, long long *incx, float *beta, float *y, long long *incy);
+void strmv_(char *uplo, char *trans, char *diag, long long *n, float *A, long long *lda, float *x, long long *incx);
+void strsv_(char *uplo, char *trans, char *diag, long long *n, float *A, long long *lda, float *x, long long *incx);
+void sger_(long long *m, long long *n, float *alpha, float *x, long long *incx, float *y, long long *incy, float *A, long long *lda);
+
+// level 3
+void sgemm_(char *ta, char *tb, long long *m, long long *n, long long *k, float *alpha, float *A, long long *lda, float *B, long long *ldb, float *beta, float *C, long long *ldc);
+void ssyrk_(char *uplo, char *trans, long long *n, long long *k, float *alpha, float *A, long long *lda, float *beta, float *C, long long *ldc);
+void strmm_(char *side, char *uplo, char *transa, char *diag, long long *m, long long *n, float *alpha, float *A, long long *lda, float *B, long long *ldb);
+void strsm_(char *side, char *uplo, char *transa, char *diag, long long *m, long long *n, float *alpha, float *A, long long *lda, float *B, long long *ldb);
+
+// lapack
+long long spotrf_(char *uplo, long long *m, float *A, long long *lda, long long *info);
+long long sgetrf_(long long *m, long long *n, float *A, long long *lda, long long *ipiv, long long *info);
+void sgeqrf_(long long *m, long long *n, float *A, long long *lda, float *tau, float *work, long long *lwork, long long *info);
+void sgeqr2_(long long *m, long long *n, float *A, long long *lda, float *tau, float *work, long long *info);
+
+
+
+#ifdef __cplusplus
+}
+#endif
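
These prototypes take every integer argument as long long because this header is included only for the 64-bit-integer reference BLAS build (the REF_BLAS_BLIS branch of s_lapack_lib.c below); the default build includes s_blas.h instead. A minimal sketch of a call through this interface, with made-up data, could look like:

	char ta = 'N', tb = 'N';
	long long mm = 2, nn = 2, kk = 2, ld = 2;
	float alpha = 1.0f, beta = 0.0f;
	float A[4] = {1.0f, 2.0f, 3.0f, 4.0f};   // column-major 2x2
	float B[4] = {1.0f, 0.0f, 0.0f, 1.0f};   // 2x2 identity
	float C[4];
	sgemm_(&ta, &tb, &mm, &nn, &kk, &alpha, A, &ld, B, &ld, &beta, C, &ld);   // C <= alpha*A*B + beta*C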
diff --git a/blas/s_lapack_lib.c b/blas/s_lapack_lib.c
new file mode 100644
index 0000000..c7cb56b
--- /dev/null
+++ b/blas/s_lapack_lib.c
@@ -0,0 +1,76 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#if defined(LA_BLAS)
+#if defined(REF_BLAS_BLIS)
+#include "s_blas_64.h"
+#else
+#include "s_blas.h"
+#endif
+#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux.h"
+
+
+
+#define REAL float
+
+#define STRMAT s_strmat
+#define STRVEC s_strvec
+
+#define GELQF_LIBSTR sgelqf_libstr
+#define GELQF_WORK_SIZE_LIBSTR sgelqf_work_size_libstr
+#define GEQRF_LIBSTR sgeqrf_libstr
+#define GEQRF_WORK_SIZE_LIBSTR sgeqrf_work_size_libstr
+#define GETF2_NOPIVOT sgetf2_nopivot
+#define GETRF_NOPIVOT_LIBSTR sgetrf_nopivot_libstr
+#define GETRF_LIBSTR sgetrf_libstr
+#define POTRF_L_LIBSTR spotrf_l_libstr
+#define POTRF_L_MN_LIBSTR spotrf_l_mn_libstr
+#define SYRK_POTRF_LN_LIBSTR ssyrk_spotrf_ln_libstr
+
+#define COPY scopy_
+#define GELQF sgelqf_
+#define GEMM sgemm_
+#define GER sger_
+#define GEQRF sgeqrf_
+#define GEQR2 sgeqr2_
+#define GETRF sgetrf_
+#define POTRF spotrf_
+#define SCAL sscal_
+#define SYRK ssyrk_
+#define TRSM strsm_
+
+
+#include "x_lapack_lib.c"
+
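+// Note on the structure of this file: it only sets up macros and then includes the
+// precision-generic template x_lapack_lib.c. REAL selects the scalar type, STRMAT/STRVEC the
+// structure types, and the remaining macros map the generic names used in the template to the
+// single-precision routines and to the underlying BLAS/LAPACK symbols. For example, the
+// template's generic definition of POTRF_L_LIBSTR becomes
+//
+//	void spotrf_l_libstr(int m, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+//
+// and calls written through the GEMM, POTRF, TRSM, ... macros resolve to the BLAS/LAPACK
+// routines sgemm_, spotrf_, strsm_ and so on.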
diff --git a/blas/s_lapack_lib4.c b/blas/s_lapack_lib4.c
new file mode 100644
index 0000000..7d02d36
--- /dev/null
+++ b/blas/s_lapack_lib4.c
@@ -0,0 +1,664 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+/****************************
+* old interface
+****************************/
+
+void ssyrk_spotrf_nt_l_lib(int m, int n, int k, float *pA, int sda, float *pB, int sdb, float *pC, int sdc, float *pD, int sdd, float *inv_diag_D)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	int alg = 1; // XXX
+
+	const int bs = 4;
+
+	int i, j, l;
+
+	i = 0;
+
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<i && j<n-3; j+=4)
+			{
+			kernel_sgemm_strsm_nt_rl_inv_4x4_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &inv_diag_D[j]);
+			}
+		if(j<n)
+			{
+			if(i<j) // dgemm
+				{
+				kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &inv_diag_D[j], m-i, n-j);
+				}
+			else // dsyrk
+				{
+				if(j<n-3)
+					{
+					kernel_ssyrk_spotrf_nt_l_4x4_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &inv_diag_D[j]);
+					}
+				else
+					{
+					kernel_ssyrk_spotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &inv_diag_D[j], m-i, n-j);
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+	left_4:
+	j = 0;
+	for(; j<i && j<n-3; j+=4)
+		{
+		kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &inv_diag_D[j], m-i, n-j);
+		}
+	if(j<n)
+		{
+		if(j<i) // dgemm
+			{
+			kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &inv_diag_D[j], m-i, n-j);
+			}
+		else // dsyrk
+			{
+			kernel_ssyrk_spotrf_nt_l_4x4_vs_lib4(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &inv_diag_D[j], m-i, n-j);
+			}
+		}
+	return;
+
+	}
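+
+// ssyrk_spotrf_nt_l_lib fuses a rank-k update with a lower Cholesky factorization on
+// panel-major (lib4, bs=4) data: it computes the lower factor of C + A*B^T into pD (and, for
+// m > n, the trailing rows obtained by the corresponding triangular solve) and stores the
+// reciprocals of the factor's diagonal in inv_diag_D. Off-diagonal 4x4 blocks go through the
+// fused gemm+trsm kernels, diagonal blocks through the fused syrk+potrf kernels, and the _vs_
+// variants handle the boundary when m or n is not a multiple of 4.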
+
+
+
+void sgetrf_nn_nopivot_lib(int m, int n, float *pC, int sdc, float *pD, int sdd, float *inv_diag_D)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+	
+	const int bs = 4;
+
+	int ii, jj, ie;
+
+	// main loop
+	ii = 0;
+	for( ; ii<m-3; ii+=4)
+		{
+		jj = 0;
+		// solve lower
+		ie = n<ii ? n : ii; // ie is multiple of 4
+		for( ; jj<ie-3; jj+=4)
+			{
+			kernel_strsm_nn_ru_inv_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[jj*bs+jj*sdd], &inv_diag_D[jj]);
+			}
+		if(jj<ie)
+			{
+			kernel_strsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[jj*bs+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+			jj+=4;
+			}
+		// factorize
+		if(jj<n-3)
+			{
+			kernel_sgetrf_nn_4x4_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &inv_diag_D[jj]);
+			jj+=4;
+			}
+		else if(jj<n)
+			{
+			kernel_sgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+			jj+=4;
+			}
+		// solve upper 
+		for( ; jj<n-3; jj+=4)
+			{
+			kernel_strsm_nn_ll_one_4x4_lib4(ii, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[ii*bs+ii*sdd]);
+			}
+		if(jj<n)
+			{
+			kernel_strsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[ii*bs+ii*sdd], m-ii, n-jj);
+			}
+		}
+	if(m>ii)
+		{
+		goto left_4;
+		}
+
+	// common return if i==m
+	return;
+
+	left_4:
+	jj = 0;
+	// solve lower
+	ie = n<ii ? n : ii; // ie is multiple of 4
+	for( ; jj<ie; jj+=4)
+		{
+		kernel_strsm_nn_ru_inv_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[jj*bs+jj*sdd], &inv_diag_D[jj], m-ii, ie-jj);
+		}
+	// factorize
+	if(jj<n)
+		{
+		kernel_sgetrf_nn_4x4_vs_lib4(jj, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &inv_diag_D[jj], m-ii, n-jj);
+		jj+=4;
+		}
+	// solve upper 
+	for( ; jj<n; jj+=4)
+		{
+		kernel_strsm_nn_ll_one_4x4_vs_lib4(ii, &pD[ii*sdd], &pD[jj*bs], sdd, &pC[jj*bs+ii*sdc], &pD[jj*bs+ii*sdd], &pD[ii*bs+ii*sdd], m-ii, n-jj);
+		}
+	return;
+
+	}
+
+
+
+void sgetrf_nn_lib(int m, int n, float *pC, int sdc, float *pD, int sdd, float *inv_diag_D, int *ipiv)
+	{
+
+	if(m<=0)
+		return;
+	
+	const int bs = 4;
+
+	int ii, jj, i0, i1, j0, ll, p;
+
+	float d1 = 1.0;
+	float dm1 = -1.0;
+
+//	// needs to perform row-exchanges on the yet-to-be-factorized matrix too
+//	if(pC!=pD)
+//		sgecp_lib(m, n, 1.0, 0, pC, sdc, 0, pD, sdd);
+
+	// minimum matrix size
+	p = n<m ? n : m; // XXX
+
+	// main loop
+	// 4 columns at a time
+	jj = 0;
+	for(; jj<p-3; jj+=4) // XXX
+		{
+		// pivot & factorize & solve lower
+		ii = jj;
+		i0 = ii;
+		for( ; ii<m-3; ii+=4)
+			{
+			kernel_sgemm_nn_4x4_lib4(jj, &dm1, &pD[ii*sdd], &pD[jj*bs], sdd, &d1, &pD[jj*bs+ii*sdd], &pD[jj*bs+ii*sdd]);
+			}
+		if(m-ii>0)
+			{
+			kernel_sgemm_nn_4x4_vs_lib4(jj, &dm1, &pD[ii*sdd], &pD[jj*bs], sdd, &d1, &pD[jj*bs+ii*sdd], &pD[jj*bs+ii*sdd], m-ii, 4);
+			}
+		kernel_sgetrf_pivot_4_lib4(m-i0, &pD[jj*bs+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+		ipiv[i0+0] += i0;
+		if(ipiv[i0+0]!=i0+0)
+			{
+			srowsw_lib(jj, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs);
+			srowsw_lib(n-jj-4, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs+(jj+4)*bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs+(jj+4)*bs);
+			}
+		ipiv[i0+1] += i0;
+		if(ipiv[i0+1]!=i0+1)
+			{
+			srowsw_lib(jj, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs);
+			srowsw_lib(n-jj-4, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs+(jj+4)*bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs+(jj+4)*bs);
+			}
+		ipiv[i0+2] += i0;
+		if(ipiv[i0+2]!=i0+2)
+			{
+			srowsw_lib(jj, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs);
+			srowsw_lib(n-jj-4, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs+(jj+4)*bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs+(jj+4)*bs);
+			}
+		ipiv[i0+3] += i0;
+		if(ipiv[i0+3]!=i0+3)
+			{
+			srowsw_lib(jj, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs);
+			srowsw_lib(n-jj-4, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs+(jj+4)*bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs+(jj+4)*bs);
+			}
+
+		// solve upper
+		ll = jj+4;
+		for( ; ll<n-3; ll+=4)
+			{
+			kernel_strsm_nn_ll_one_4x4_lib4(i0, &pD[i0*sdd], &pD[ll*bs], sdd, &pD[ll*bs+i0*sdd], &pD[ll*bs+i0*sdd], &pD[i0*bs+i0*sdd]);
+			}
+		if(n-ll>0)
+			{
+			kernel_strsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*bs], sdd, &pD[ll*bs+i0*sdd], &pD[ll*bs+i0*sdd], &pD[i0*bs+i0*sdd], 4, n-ll);
+			}
+		}
+	if(m>=n)
+		{
+		if(n-jj>0)
+			{
+			goto left_n_4;
+			}
+		}
+	else
+		{
+		if(m-jj>0)
+			{
+			goto left_m_4;
+			}
+		}
+
+	// common return if jj==n
+	return;
+
+	// clean up
+
+	left_n_4:
+	// 1-4 columns at a time
+	// pivot & factorize & solve lower
+	ii = jj;
+	i0 = ii;
+	for( ; ii<m; ii+=4)
+		{
+		kernel_sgemm_nn_4x4_vs_lib4(jj, &dm1, &pD[ii*sdd], &pD[jj*bs], sdd, &d1, &pD[jj*bs+ii*sdd], &pD[jj*bs+ii*sdd], m-ii, n-jj);
+		}
+	kernel_sgetrf_pivot_4_vs_lib4(m-i0, n-jj, &pD[jj*bs+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+	ipiv[i0+0] += i0;
+	if(ipiv[i0+0]!=i0+0)
+		{
+		srowsw_lib(jj, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs);
+		srowsw_lib(n-jj-4, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs+(jj+4)*bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs+(jj+4)*bs);
+		}
+	if(n-jj>1)
+		{
+		ipiv[i0+1] += i0;
+		if(ipiv[i0+1]!=i0+1)
+			{
+			srowsw_lib(jj, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs);
+			srowsw_lib(n-jj-4, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs+(jj+4)*bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs+(jj+4)*bs);
+			}
+		if(n-jj>2)
+			{
+			ipiv[i0+2] += i0;
+			if(ipiv[i0+2]!=i0+2)
+				{
+				srowsw_lib(jj, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs);
+				srowsw_lib(n-jj-4, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs+(jj+4)*bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs+(jj+4)*bs);
+				}
+			if(n-jj>3)
+				{
+				ipiv[i0+3] += i0;
+				if(ipiv[i0+3]!=i0+3)
+					{
+					srowsw_lib(jj, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs);
+					srowsw_lib(n-jj-4, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs+(jj+4)*bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs+(jj+4)*bs);
+					}
+				}
+			}
+		}
+
+	// solve upper
+	if(0) // there is no upper
+		{
+		ll = jj+4;
+		for( ; ll<n; ll+=4)
+			{
+			kernel_strsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*bs], sdd, &pD[ll*bs+i0*sdd], &pD[ll*bs+i0*sdd], &pD[i0*bs+i0*sdd], m-i0, n-ll);
+			}
+		}
+	return;
+
+
+	left_m_4:
+	// 1-4 rows at a time
+	// pivot & factorize & solve lower
+	ii = jj;
+	i0 = ii;
+	kernel_sgemm_nn_4x4_vs_lib4(jj, &dm1, &pD[ii*sdd], &pD[jj*bs], sdd, &d1, &pD[jj*bs+ii*sdd], &pD[jj*bs+ii*sdd], m-ii, n-jj);
+	kernel_sgetrf_pivot_4_vs_lib4(m-i0, n-jj, &pD[jj*bs+i0*sdd], sdd, &inv_diag_D[jj], &ipiv[i0]);
+	ipiv[i0+0] += i0;
+	if(ipiv[i0+0]!=i0+0)
+		{
+		srowsw_lib(jj, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs);
+		srowsw_lib(n-jj-4, pD+(i0+0)/bs*bs*sdd+(i0+0)%bs+(jj+4)*bs, pD+(ipiv[i0+0])/bs*bs*sdd+(ipiv[i0+0])%bs+(jj+4)*bs);
+		}
+	if(m-i0>1)
+		{
+		ipiv[i0+1] += i0;
+		if(ipiv[i0+1]!=i0+1)
+			{
+			srowsw_lib(jj, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs);
+			srowsw_lib(n-jj-4, pD+(i0+1)/bs*bs*sdd+(i0+1)%bs+(jj+4)*bs, pD+(ipiv[i0+1])/bs*bs*sdd+(ipiv[i0+1])%bs+(jj+4)*bs);
+			}
+		if(m-i0>2)
+			{
+			ipiv[i0+2] += i0;
+			if(ipiv[i0+2]!=i0+2)
+				{
+				srowsw_lib(jj, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs);
+				srowsw_lib(n-jj-4, pD+(i0+2)/bs*bs*sdd+(i0+2)%bs+(jj+4)*bs, pD+(ipiv[i0+2])/bs*bs*sdd+(ipiv[i0+2])%bs+(jj+4)*bs);
+				}
+			if(m-i0>3)
+				{
+				ipiv[i0+3] += i0;
+				if(ipiv[i0+3]!=i0+3)
+					{
+					srowsw_lib(jj, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs);
+					srowsw_lib(n-jj-4, pD+(i0+3)/bs*bs*sdd+(i0+3)%bs+(jj+4)*bs, pD+(ipiv[i0+3])/bs*bs*sdd+(ipiv[i0+3])%bs+(jj+4)*bs);
+					}
+				}
+			}
+		}
+
+	// solve upper
+	ll = jj+4;
+	for( ; ll<n; ll+=4)
+		{
+		kernel_strsm_nn_ll_one_4x4_vs_lib4(i0, &pD[i0*sdd], &pD[ll*bs], sdd, &pD[ll*bs+i0*sdd], &pD[ll*bs+i0*sdd], &pD[i0*bs+i0*sdd], m-i0, n-ll);
+		}
+	return;
+
+	}
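+
+// Note on sgetrf_nn_lib: the factorization is performed in place on pD (pC and sdc are not
+// referenced here), so callers that want to keep C copy it into D first, as sgetrf_libstr
+// below does. On return, ipiv[k] holds the 0-based global index of the row that was swapped
+// with row k (the "+= i0" updates above turn the kernel's local pivot indices into global
+// ones), the corresponding row swaps have been applied to the whole of pD, and inv_diag_D
+// holds the reciprocals of the diagonal entries of U.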
+
+
+
+/****************************
+* new interface
+****************************/
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+
+
+// spotrf
+void spotrf_l_libstr(int m, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0)
+		return;
+
+	if(ci!=0 | di!=0)
+		{
+		printf("\nspotrf_l_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+		exit(1);
+		}
+
+	const int bs = 4;
+
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+	float *dD = sD->dA;
+	if(di==0 && dj==0) // XXX what to do if di and dj are not zero
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+
+	int i, j, l;
+
+	i = 0;
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<i; j+=4)
+			{
+			kernel_strsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j]);
+			}
+		kernel_spotrf_nt_l_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j]);
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+	left_4: // 1 - 3
+	j = 0;
+	for(; j<i; j+=4)
+		{
+		kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+		}
+	kernel_spotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+	return;
+
+	}
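+
+// A minimal usage sketch for the routine above (assuming the helpers s_allocate_strmat,
+// s_cvt_mat2strmat and s_free_strmat declared in blasfeo_s_aux.h; C is symmetric positive
+// definite and only its lower triangle is referenced):
+//
+//	struct s_strmat sC, sD;
+//	s_allocate_strmat(n, n, &sC);
+//	s_allocate_strmat(n, n, &sD);
+//	s_cvt_mat2strmat(n, n, C, n, &sC, 0, 0);   // C: column-major, leading dimension n
+//	spotrf_l_libstr(n, &sC, 0, 0, &sD, 0, 0);  // D <- lower Cholesky factor of C
+//	s_free_strmat(&sD);
+//	s_free_strmat(&sC);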
+
+
+
+// spotrf
+void spotrf_l_mn_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	if(ci!=0 | di!=0)
+		{
+		printf("\nspotrf_l_mn_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+		exit(1);
+		}
+
+	const int bs = 4;
+
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+	float *dD = sD->dA;
+	if(di==0 && dj==0) // XXX what to do if di and dj are not zero
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+
+	int i, j, l;
+
+	i = 0;
+	for(; i<m-3; i+=4)
+		{
+		j = 0;
+		for(; j<i && j<n-3; j+=4)
+			{
+			kernel_strsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j]);
+			}
+		if(j<n)
+			{
+			if(i<j) // dtrsm
+				{
+				kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+				}
+			else // dpotrf
+				{
+				if(j<n-3)
+					{
+					kernel_spotrf_nt_l_4x4_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j]);
+					}
+				else
+					{
+					kernel_spotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		goto left_4;
+		}
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+	left_4:
+	j = 0;
+	for(; j<i && j<n-3; j+=4)
+		{
+		kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+		}
+	if(j<n)
+		{
+		if(j<i) // dtrsm
+			{
+			kernel_strsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+			}
+		else // dpotrf
+			{
+			kernel_spotrf_nt_l_4x4_vs_lib4(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+			}
+		}
+	return;
+
+	}
+
+
+
+// ssyrk spotrf
+void ssyrk_spotrf_ln_libstr(int m, int n, int k, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+	if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+		{
+		printf("\nssyrk_spotrf_ln_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+		exit(1);
+		}
+	const int bs = 4;
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+	float *dD = sD->dA; // XXX what to do if di and dj are not zero
+	ssyrk_spotrf_nt_l_lib(m, n, k, pA, sda, pB, sdb, pC, sdc, pD, sdd, dD);
+	if(di==0 && dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+	return;
+	}
+
+
+
+// sgetrf without pivoting
+void sgetrf_nopivot_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+	if(ci!=0 | di!=0)
+		{
+		printf("\nsgetrf_nopivot_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+		exit(1);
+		}
+	const int bs = 4;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+	float *dD = sD->dA; // XXX what to do if di and dj are not zero
+	sgetrf_nn_nopivot_lib(m, n, pC, sdc, pD, sdd, dD);
+	if(di==0 && dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+	return;
+	}
+
+
+
+
+// sgetrf with pivoting
+void sgetrf_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj, int *ipiv)
+	{
+	if(ci!=0 | di!=0)
+		{
+		printf("\nsgetrf_libstr: feature not implemented yet: ci=%d, di=%d\n", ci, di);
+		exit(1);
+		}
+	const int bs = 4;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+	float *dD = sD->dA; // XXX what to do if di and dj are not zero
+	// needs to perform row-exchanges on the yet-to-be-factorized matrix too
+	if(pC!=pD)
+		sgecp_libstr(m, n, sC, ci, cj, sD, di, dj);
+	sgetrf_nn_lib(m, n, pC, sdc, pD, sdd, dD, ipiv);
+	if(di==0 && dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+	return;
+	}
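+
+// A minimal usage sketch for sgetrf_libstr (same assumptions on the aux helpers as in the
+// sketch after spotrf_l_libstr above): ipiv needs at least min(m,n) entries and receives
+// 0-based row-swap indices.
+//
+//	struct s_strmat sA, sLU;
+//	int *ipiv = malloc((m<n ? m : n)*sizeof(int));
+//	s_allocate_strmat(m, n, &sA);
+//	s_allocate_strmat(m, n, &sLU);
+//	s_cvt_mat2strmat(m, n, A, m, &sA, 0, 0);           // A: column-major, leading dimension m
+//	sgetrf_libstr(m, n, &sA, 0, 0, &sLU, 0, 0, ipiv);  // LU <- unit-lower L and U of the row-permuted A
+//	s_free_strmat(&sLU);
+//	s_free_strmat(&sA);
+//	free(ipiv);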
+
+
+
+int sgeqrf_work_size_libstr(int m, int n)
+	{
+	printf("\nsgeqrf_work_size_libstr: feature not implemented yet\n");
+	exit(1);
+	return 0;
+	}
+
+
+
+void sgeqrf_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj, void *work)
+	{
+	if(m<=0 | n<=0)
+		return;
+	printf("\nsgeqrf_libstr: feature not implemented yet\n");
+	exit(1);
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
diff --git a/blas/s_lapack_lib8.c b/blas/s_lapack_lib8.c
new file mode 100644
index 0000000..3b5239e
--- /dev/null
+++ b/blas/s_lapack_lib8.c
@@ -0,0 +1,872 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+
+
+
+void spotrf_l_libstr(int m, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0)
+		return;
+
+	if(ci>0 | di>0)
+		{
+		printf("\nspotrf_l_libstr: feature not implemented yet: ci>0, di>0\n");
+		exit(1);
+		}
+
+	const int bs = 8;
+
+	int i, j;
+
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+	float *dD = sD->dA; // XXX what to do if di and dj are not zero
+	if(di==0 & dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+
+	i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-23; i+=24)
+		{
+		j = 0;
+		for(; j<i; j+=8)
+			{
+			kernel_strsm_nt_rl_inv_24x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+			kernel_strsm_nt_rl_inv_24x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+			}
+		kernel_spotrf_nt_l_24x4_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+		kernel_spotrf_nt_l_20x4_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+		kernel_spotrf_nt_l_16x4_lib8((j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8]);
+		kernel_spotrf_nt_l_12x4_lib8((j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12]);
+		kernel_spotrf_nt_l_8x8_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16]);
+		}
+	if(m>i)
+		{
+		if(m-i<=4)
+			{
+			goto left_4;
+			}
+		else if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else if(m-i<=12)
+			{
+			goto left_12;
+			}
+		else if(m-i<=16)
+			{
+			goto left_16;
+			}
+		else
+			{
+			goto left_24;
+			}
+		}
+#else
+	for(; i<m-15; i+=16)
+		{
+		j = 0;
+		for(; j<i; j+=8)
+			{
+			kernel_strsm_nt_rl_inv_16x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+			kernel_strsm_nt_rl_inv_16x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+			}
+		kernel_spotrf_nt_l_16x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+		kernel_spotrf_nt_l_12x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+		kernel_spotrf_nt_l_8x8_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8]);
+		}
+	if(m>i)
+		{
+		if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else
+			{
+			goto left_16;
+			}
+		}
+#endif
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_24: // 17 <= m <= 23
+	j = 0;
+	for(; j<i & j<m-7; j+=8)
+		{
+		kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, m-(j+0));
+		kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, m-(j+4));
+		}
+	kernel_spotrf_nt_l_24x4_vs_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), m-(j+0));
+	kernel_spotrf_nt_l_20x4_vs_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), m-(j+4));
+	kernel_spotrf_nt_l_16x4_vs_lib8((j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), m-(j+8));
+	kernel_spotrf_nt_l_12x4_vs_lib8((j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12], m-(i+8), m-(j+12));
+	if(j<m-20) // 21 - 23
+		{
+		kernel_spotrf_nt_l_8x8_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), m-(j+16));
+		}
+	else // 17 18 19 20
+		{
+		kernel_spotrf_nt_l_8x4_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), m-(j+16));
+		}
+	return;
+#endif
+
+	left_16: // 9 <= m <= 16
+	j = 0;
+	for(; j<i; j+=8)
+		{
+		kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, m-(j+0));
+		kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, m-(j+4));
+		}
+	kernel_spotrf_nt_l_16x4_vs_lib8(j+0, &pD[(i+0)*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+j*sdc], sdc, &pD[(j+0)*bs+j*sdd], sdd, &dD[j+0], m-(i+0), m-(j+0));
+	kernel_spotrf_nt_l_12x4_vs_lib8(j+4, &pD[(i+0)*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+j*sdc], sdc, &pD[(j+4)*bs+j*sdd], sdd, &dD[j+4], m-(i+0), m-(j+4));
+	if(j<m-12) // 13 - 16
+		{
+		kernel_spotrf_nt_l_8x8_vs_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), m-(j+8));
+		}
+	else // 9 - 12
+		{
+		kernel_spotrf_nt_l_8x4_vs_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), m-(j+8));
+		}
+	return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_12: // 9 <= m <= 12
+	j = 0;
+	for(; j<i; j+=8)
+		{
+		kernel_strsm_nt_rl_inv_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+		kernel_strsm_nt_rl_inv_4x8_vs_lib8(j, &pD[(i+8)*sdd], &pD[j*sdd], &pC[j*bs+(i+8)*sdc], &pD[j*bs+(i+8)*sdd], &pD[j*bs+j*sdd], &dD[j], m-(i+8), m-j);
+		}
+	kernel_spotrf_nt_l_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+	kernel_strsm_nt_rl_inv_4x8_vs_lib8(j, &pD[(i+8)*sdd], &pD[j*sdd], &pC[j*bs+(i+8)*sdc], &pD[j*bs+(i+8)*sdd], &pD[j*bs+j*sdd], &dD[j], m-(i+8), m-j);
+	if(j<m-8) // 9 - 12
+		{
+		kernel_spotrf_nt_l_8x4_vs_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[(j+8)], m-(i+8), m-(j+8));
+		}
+	return;
+#endif
+
+	left_8: // 1 <= m <= 8
+	j = 0;
+	for(; j<i; j+=8)
+		{
+		kernel_strsm_nt_rl_inv_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+		}
+	if(j<m-4) // 5 - 8
+		{
+		kernel_spotrf_nt_l_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+		}
+	else // 1 - 4
+		{
+		kernel_spotrf_nt_l_8x4_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+		}
+	return;
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_4: // 1 <= m <= 4
+	j = 0;
+	for(; j<i; j+=8)
+		{
+		kernel_strsm_nt_rl_inv_4x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+		}
+	kernel_spotrf_nt_l_8x4_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, m-j);
+	return;
+#endif
+
+	}
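+
+// Blocking of the loop above: on TARGET_X64_INTEL_HASWELL the main loop factorizes 24 rows
+// (three 8-wide panels) per iteration with the 24x4/20x4/16x4/12x4/8x8 kernels, otherwise 16
+// rows with the 16x4/12x4/8x8 kernels; the left_* labels handle the remaining 1-23 (or 1-15)
+// rows with the variable-size (_vs_) kernels.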
+
+
+
+void spotrf_l_mn_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(m<=0 | n<=0)
+		return;
+
+	if(ci>0 | di>0)
+		{
+		printf("\nspotrf_l_mn_libstr: feature not implemented yet: ci>0, di>0\n");
+		exit(1);
+		}
+
+	const int bs = 8;
+
+	int i, j;
+
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+	float *dD = sD->dA; // XXX what to do if di and dj are not zero
+	if(di==0 & dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+
+	i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-23; i+=24)
+		{
+		j = 0;
+		for(; j<i & j<n-7; j+=8)
+			{
+			kernel_strsm_nt_rl_inv_24x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+			kernel_strsm_nt_rl_inv_24x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+			}
+		if(j<n)
+			{
+			if(i<j) // dtrsm
+				{
+				kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+				if(j<n-4) // 5 6 7
+					{
+					kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[(j+4)*bs+(j+4)*sdd], &dD[j+4], m-i, n-(j+4));
+					}
+				}
+			else // dpotrf
+				{
+				if(j<n-23)
+					{
+					kernel_spotrf_nt_l_24x4_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+					kernel_spotrf_nt_l_20x4_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+					kernel_spotrf_nt_l_16x4_lib8((j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8]);
+					kernel_spotrf_nt_l_12x4_lib8((j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12]);
+					kernel_spotrf_nt_l_8x8_lib8((j+16), &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16]);
+					}
+				else
+					{
+					if(j<n-4) // 5 - 23
+						{
+						kernel_spotrf_nt_l_24x4_vs_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+						kernel_spotrf_nt_l_20x4_vs_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+						if(j==n-8)
+							return;
+						if(j<n-12) // 13 - 23
+							{
+							kernel_spotrf_nt_l_16x4_vs_lib8((j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+							kernel_spotrf_nt_l_12x4_vs_lib8((j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12], m-(i+8), n-(j+12));
+							if(j==n-16)
+								return;
+							if(j<n-20) // 21 - 23
+								{
+								kernel_spotrf_nt_l_8x8_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+								}
+							else // 17 18 19 20
+								{
+								kernel_spotrf_nt_l_8x4_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+								}
+							}
+						else // 9 10 11 12
+							{
+							kernel_spotrf_nt_l_16x4_vs_lib8(j+8, &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+							}
+						}
+					else // 1 2 3 4
+						{
+						kernel_spotrf_nt_l_24x4_vs_lib8(j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+						}
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else if(m-i<=16)
+			{
+			goto left_16;
+			}
+		else
+			{
+			goto left_24;
+			}
+		}
+#else
+	for(; i<m-15; i+=16)
+		{
+		j = 0;
+		for(; j<i & j<n-7; j+=8)
+			{
+			kernel_strsm_nt_rl_inv_16x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+			kernel_strsm_nt_rl_inv_16x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+			}
+		if(j<n)
+			{
+			if(i<j) // dtrsm
+				{
+				kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+				if(j<n-4) // 5 6 7
+					{
+					kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[(j+4)*bs+(j+4)*sdd], &dD[j+4], m-i, n-(j+4));
+					}
+				}
+			else // dpotrf
+				{
+				if(j<n-15)
+					{
+					kernel_spotrf_nt_l_16x4_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+					kernel_spotrf_nt_l_12x4_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+					kernel_spotrf_nt_l_8x8_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8]);
+					}
+				else
+					{
+					if(j<n-4) // 5 - 15
+						{
+						kernel_spotrf_nt_l_16x4_vs_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+						kernel_spotrf_nt_l_12x4_vs_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+						if(j==n-8) // 8
+							return;
+						if(j<n-12) // 13 - 15
+							{
+							kernel_spotrf_nt_l_8x8_vs_lib8(j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+							}
+						else // 9 10 11 12
+							{
+							kernel_spotrf_nt_l_8x4_vs_lib8(j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+							}
+						}
+					else // 1 2 3 4
+						{
+						kernel_spotrf_nt_l_16x4_vs_lib8(j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+						}
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else
+			{
+			goto left_16;
+			}
+		}
+#endif
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_24:
+	j = 0;
+	for(; j<i & j<n-7; j+=8)
+		{
+		kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+		kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+		}
+	if(j<n)
+		{
+		if(j<i) // dtrsm
+			{
+			kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+			if(j<n-4) // 5 6 7
+				{
+				kernel_strsm_nt_rl_inv_24x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+				}
+			}
+		else // dpotrf
+			{
+			if(j<n-4) // 5 - 23
+				{
+				kernel_spotrf_nt_l_24x4_vs_lib8((j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+				kernel_spotrf_nt_l_20x4_vs_lib8((j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+				if(j>=n-8)
+					return;
+				if(j<n-12) // 13 - 23
+					{
+					kernel_spotrf_nt_l_16x4_vs_lib8((j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+					kernel_spotrf_nt_l_12x4_vs_lib8((j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12], m-(i+8), n-(j+12));
+					if(j>=n-16)
+						return;
+					if(j<n-20) // 21 - 23
+						{
+						kernel_spotrf_nt_l_8x8_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+						}
+					else // 17 18 19 20
+						{
+						kernel_spotrf_nt_l_8x4_vs_lib8(j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+						}
+					}
+				else // 9 10 11 12
+					{
+					kernel_spotrf_nt_l_16x4_vs_lib8(j+8, &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+					}
+				}
+			else // 1 2 3 4
+				{
+				kernel_spotrf_nt_l_24x4_vs_lib8(j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+				}
+			}
+		}
+	return;
+#endif
+
+	left_16:
+	j = 0;
+	for(; j<i & j<n-7; j+=8)
+		{
+		kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+		kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+		}
+	if(j<n)
+		{
+		if(j<i) // dtrsm
+			{
+			kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+			if(j<n-4) // 5 6 7
+				{
+				kernel_strsm_nt_rl_inv_16x4_vs_lib8(j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+				}
+			}
+		else // dpotrf
+			{
+			if(j<n-4) // 5 - 15
+				{
+				kernel_spotrf_nt_l_16x4_vs_lib8(j+0, &pD[(i+0)*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+j*sdc], sdc, &pD[(j+0)*bs+j*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+				kernel_spotrf_nt_l_12x4_vs_lib8(j+4, &pD[(i+0)*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+j*sdc], sdc, &pD[(j+4)*bs+j*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+				if(j>=n-8)
+					return;
+				if(j<n-12) // 13 - 15
+					{
+					kernel_spotrf_nt_l_8x8_vs_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+					}
+				else // 9 - 12
+					{
+					kernel_spotrf_nt_l_8x4_vs_lib8((j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+					}
+				}
+			else // 1 2 3 4
+				{
+				kernel_spotrf_nt_l_16x4_vs_lib8(j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+				}
+			}
+		}
+	return;
+
+	left_8:
+	j = 0;
+	for(; j<i & j<n-7; j+=8)
+		{
+		kernel_strsm_nt_rl_inv_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+		}
+	if(j<n)
+		{
+		if(j<i) // dtrsm
+			{
+			if(j<n-4) // 5 6 7
+				{
+				kernel_strsm_nt_rl_inv_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+				}
+			else // 1 2 3 4
+				{
+				kernel_strsm_nt_rl_inv_8x4_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+				}
+			}
+		else // dpotrf
+			{
+			if(j<n-4) // 5 6 7
+				{
+				kernel_spotrf_nt_l_8x8_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+				}
+			else // 1 2 3 4
+				{
+				kernel_spotrf_nt_l_8x4_vs_lib8(j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+				}
+			}
+		}
+
+	return;
+
+	}
+
+
+
+void ssyrk_spotrf_ln_libstr(int m, int n, int k, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj)
+	{
+
+	if(ai!=0 | bi!=0 | ci!=0 | di!=0)
+		{
+		printf("\nssyrk_spotrf_ln_libstr: feature not implemented yet: ai=%d, bi=%d, ci=%d, di=%d\n", ai, bi, ci, di);
+		exit(1);
+		}
+
+	const int bs = 8;
+
+	int i, j;
+
+	int sda = sA->cn;
+	int sdb = sB->cn;
+	int sdc = sC->cn;
+	int sdd = sD->cn;
+	float *pA = sA->pA + aj*bs;
+	float *pB = sB->pA + bj*bs;
+	float *pC = sC->pA + cj*bs;
+	float *pD = sD->pA + dj*bs;
+	float *dD = sD->dA; // XXX what to do if di and dj are not zero
+
+//	ssyrk_spotrf_nt_l_lib(m, n, k, pA, sda, pB, sdb, pC, sdc, pD, sdd, dD);
+
+	if(di==0 && dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+
+	i = 0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; i<m-23; i+=24)
+		{
+		j = 0;
+		for(; j<i & j<n-7; j+=8)
+			{
+			kernel_sgemm_strsm_nt_rl_inv_24x4_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+			kernel_sgemm_strsm_nt_rl_inv_24x4_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+			}
+		if(j<n)
+			{
+			if(i<j) // dtrsm
+				{
+				kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+				if(j<n-4) // 5 6 7
+					{
+					kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[(j+4)*bs+(j+4)*sdd], &dD[j+4], m-i, n-(j+4));
+					}
+				}
+			else // dpotrf
+				{
+				if(j<n-23)
+					{
+					kernel_ssyrk_spotrf_nt_l_24x4_lib8(k, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], (j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+					kernel_ssyrk_spotrf_nt_l_20x4_lib8(k, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], (j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+					kernel_ssyrk_spotrf_nt_l_16x4_lib8(k, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], (j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8]);
+					kernel_ssyrk_spotrf_nt_l_12x4_lib8(k, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], (j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12]);
+					kernel_ssyrk_spotrf_nt_l_8x8_lib8(k, &pA[(i+16)*sda], &pB[(j+16)*sdb], (j+16), &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16]);
+					}
+				else
+					{
+					if(j<n-4) // 5 - 23
+						{
+						kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], (j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+						kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], (j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+						if(j==n-8)
+							return;
+						if(j<n-12) // 13 - 23
+							{
+							kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], (j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+							kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], (j+12), &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12], m-(i+8), n-(j+12));
+							if(j==n-16)
+								return;
+							if(j<n-20) // 21 - 23
+								{
+								kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(k, &pA[(i+16)*sda], &pB[(j+16)*sdb], j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+								}
+							else // 17 18 19 20
+								{
+								kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(k, &pA[(i+16)*sda], &pB[(j+16)*sdb], j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+								}
+							}
+						else // 9 10 11 12
+							{
+							kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+							}
+						}
+					else // 1 2 3 4
+						{
+						kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[j*sdb], j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+						}
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else if(m-i<=16)
+			{
+			goto left_16;
+			}
+		else
+			{
+			goto left_24;
+			}
+		}
+#else
+	for(; i<m-15; i+=16)
+		{
+		j = 0;
+		for(; j<i & j<n-7; j+=8)
+			{
+			kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0]);
+			kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4]);
+			}
+		if(j<n)
+			{
+			if(i<j) // dtrsm
+				{
+				kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+				if(j<n-4) // 5 6 7
+					{
+					kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[(j+4)*bs+(j+4)*sdd], &dD[j+4], m-i, n-(j+4));
+					}
+				}
+			else // dpotrf
+				{
+				if(j<n-15)
+					{
+					kernel_ssyrk_spotrf_nt_l_16x4_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0]);
+					kernel_ssyrk_spotrf_nt_l_12x4_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4]);
+					kernel_ssyrk_spotrf_nt_l_8x8_lib8(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], (j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8]);
+					}
+				else
+					{
+					if(j<n-4) // 5 - 15
+						{
+						kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], (j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+						kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], j+4, &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+						if(j==n-8) // 8
+							return;
+						if(j<n-12) // 13 - 15
+							{
+							kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+							}
+						else // 9 10 11 12
+							{
+							kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+							}
+						}
+					else // 1 2 3 4
+						{
+						kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[j*sdb], j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+						}
+					}
+				}
+			}
+		}
+	if(m>i)
+		{
+		if(m-i<=8)
+			{
+			goto left_8;
+			}
+		else
+			{
+			goto left_16;
+			}
+		}
+#endif
+
+	// common return if i==m
+	return;
+
+	// clean up loops definitions
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+	left_24:
+	j = 0;
+	for(; j<i & j<n-7; j+=8)
+		{
+		kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+		kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+		}
+	if(j<n)
+		{
+		if(j<i) // dtrsm
+			{
+			kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+			if(j<n-4) // 5 6 7
+				{
+				kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+				}
+			}
+		else // dpotrf
+			{
+			if(j<n-4) // 5 - 23
+				{
+				kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[(j+0)*sdb], (j+0), &pD[(i+0)*sdd], sdd, &pD[(j+0)*sdd], &pC[(j+0)*bs+(j+0)*sdc], sdc, &pD[(j+0)*bs+(j+0)*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+				kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[4+(j+0)*sdb], (j+4), &pD[(i+0)*sdd], sdd, &pD[4+(j+0)*sdd], &pC[(j+4)*bs+(j+0)*sdc], sdc, &pD[(j+4)*bs+(j+0)*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+				if(j>=n-8)
+					return;
+				if(j<n-12) // 13 - 23
+					{
+					kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], (j+8), &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+					kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[4+(j+8)*sdb], j+12, &pD[(i+8)*sdd], sdd, &pD[4+(j+8)*sdd], &pC[(j+12)*bs+(j+8)*sdc], sdc, &pD[(j+12)*bs+(j+8)*sdd], sdd, &dD[j+12], m-(i+8), n-(j+12));
+					if(j>=n-16)
+						return;
+					if(j<n-20) // 21 - 23
+						{
+						kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(k, &pA[(i+16)*sda], &pB[(j+16)*sdb], j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+						}
+					else // 17 18 19 20
+						{
+						kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(k, &pA[(i+16)*sda], &pB[(j+16)*sdb], j+16, &pD[(i+16)*sdd], &pD[(j+16)*sdd], &pC[(j+16)*bs+(j+16)*sdc], &pD[(j+16)*bs+(j+16)*sdd], &dD[j+16], m-(i+16), n-(j+16));
+						}
+					}
+				else // 9 10 11 12
+					{
+					kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+8)*sda], sda, &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], sdd, &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], sdc, &pD[(j+8)*bs+(j+8)*sdd], sdd, &dD[j+8], m-(i+8), n-(j+8));
+					}
+				}
+			else // 1 2 3 4
+				{
+				kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[j*sdb], j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+				}
+			}
+		}
+	return;
+#endif
+
+	left_16:
+	j = 0;
+	for(; j<i & j<n-7; j+=8)
+		{
+		kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[0+(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+		kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+		}
+	if(j<n)
+		{
+		if(j<i) // dtrsm
+			{
+			kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[0+j*sdb], j+0, &pD[i*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+i*sdc], sdc, &pD[(j+0)*bs+i*sdd], sdd, &pD[(j+0)*bs+(j+0)*sdd], &dD[j+0], m-i, n-(j+0));
+			if(j<n-4) // 5 6 7
+				{
+				kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(k, &pA[i*sda], sda, &pB[4+j*sdb], j+4, &pD[i*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+i*sdc], sdc, &pD[(j+4)*bs+i*sdd], sdd, &pD[4+(j+4)*bs+(j+0)*sdd], &dD[j+4], m-i, n-(j+4));
+				}
+			}
+		else // dpotrf
+			{
+			if(j<n-4) // 5 - 15
+				{
+				kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[0+j*sdb], j+0, &pD[(i+0)*sdd], sdd, &pD[0+j*sdd], &pC[(j+0)*bs+j*sdc], sdc, &pD[(j+0)*bs+j*sdd], sdd, &dD[j+0], m-(i+0), n-(j+0));
+				kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[4+j*sdb], j+4, &pD[(i+0)*sdd], sdd, &pD[4+j*sdd], &pC[(j+4)*bs+j*sdc], sdc, &pD[(j+4)*bs+j*sdd], sdd, &dD[j+4], m-(i+0), n-(j+4));
+				if(j>=n-8)
+					return;
+				if(j<n-12) // 13 - 15
+					{
+					kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], (j+8), &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+					}
+				else // 9 - 12
+					{
+					kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(k, &pA[(i+8)*sda], &pB[(j+8)*sdb], j+8, &pD[(i+8)*sdd], &pD[(j+8)*sdd], &pC[(j+8)*bs+(j+8)*sdc], &pD[(j+8)*bs+(j+8)*sdd], &dD[j+8], m-(i+8), n-(j+8));
+					}
+				}
+			else // 1 2 3 4
+				{
+				kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(k, &pA[(i+0)*sda], sda, &pB[j*sdb], j, &pD[(i+0)*sdd], sdd, &pD[j*sdd], &pC[j*bs+j*sdc], sdc, &pD[j*bs+j*sdd], sdd, &dD[j], m-(i+0), n-j);
+				}
+			}
+		}
+	return;
+
+	left_8:
+	j = 0;
+	for(; j<i & j<n-7; j+=8)
+		{
+		kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+		}
+	if(j<n)
+		{
+		if(j<i) // dtrsm
+			{
+			if(j<n-4) // 5 6 7
+				{
+				kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+				}
+			else // 1 2 3 4
+				{
+				kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+i*sdc], &pD[j*bs+i*sdd], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+				}
+			}
+		else // dpotrf
+			{
+			if(j<n-4) // 5 6 7
+				{
+				kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+				}
+			else // 1 2 3 4
+				{
+				kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(k, &pA[i*sda], &pB[j*sdb], j, &pD[i*sdd], &pD[j*sdd], &pC[j*bs+j*sdc], &pD[j*bs+j*sdd], &dD[j], m-i, n-j);
+				}
+			}
+		}
+	return;
+
+	}
+
+
+
+int sgeqrf_work_size_libstr(int m, int n)
+	{
+	printf("\nsgeqrf_work_size_libstr: feature not implemented yet\n");
+	exit(1);
+	return 0;
+	}
+
+
+
+void sgeqrf_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj, void *work)
+	{
+	if(m<=0 | n<=0)
+		return;
+	printf("\nsgeqrf_libstr: feature not implemented yet\n");
+	exit(1);
+	return;
+	}
+
+
+
+
diff --git a/blas/x_blas1_lib.c b/blas/x_blas1_lib.c
new file mode 100644
index 0000000..5f8fc2e
--- /dev/null
+++ b/blas/x_blas1_lib.c
@@ -0,0 +1,186 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE)
+
+
+
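+// z <= y + alpha*x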
+void AXPY_LIBSTR(int m, REAL alpha, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+	{
+	if(m<=0)
+		return;
+	int ii;
+	REAL *x = sx->pa + xi;
+	REAL *y = sy->pa + yi;
+	REAL *z = sz->pa + zi;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		z[ii+0] = y[ii+0] + alpha*x[ii+0];
+		z[ii+1] = y[ii+1] + alpha*x[ii+1];
+		z[ii+2] = y[ii+2] + alpha*x[ii+2];
+		z[ii+3] = y[ii+3] + alpha*x[ii+3];
+		}
+	for(; ii<m; ii++)
+		z[ii+0] = y[ii+0] + alpha*x[ii+0];
+	return;
+	}
+
+
+
+// multiply two vectors and compute dot product
+REAL VECMULDOT_LIBSTR(int m, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+	{
+	if(m<=0)
+		return 0.0;
+	REAL *x = sx->pa + xi;
+	REAL *y = sy->pa + yi;
+	REAL *z = sz->pa + zi;
+	int ii;
+	REAL dot = 0.0;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		z[ii+0] = x[ii+0] * y[ii+0];
+		z[ii+1] = x[ii+1] * y[ii+1];
+		z[ii+2] = x[ii+2] * y[ii+2];
+		z[ii+3] = x[ii+3] * y[ii+3];
+		dot += z[ii+0] + z[ii+1] + z[ii+2] + z[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		z[ii+0] = x[ii+0] * y[ii+0];
+		dot += z[ii+0];
+		}
+	return dot;
+	}
+
+
+
+// compute dot product of two vectors
+REAL DOT_LIBSTR(int m, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi)
+	{
+	if(m<=0)
+		return 0.0;
+	REAL *x = sx->pa + xi;
+	REAL *y = sy->pa + yi;
+	int ii;
+	REAL dot = 0.0;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		dot += x[ii+0] * y[ii+0];
+		dot += x[ii+1] * y[ii+1];
+		dot += x[ii+2] * y[ii+2];
+		dot += x[ii+3] * y[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		dot += x[ii+0] * y[ii+0];
+		}
+	return dot;
+	}
+
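+// A minimal usage sketch of the vector routines above, assuming the
+// double-precision wrapper maps REAL to double, STRVEC to d_strvec,
+// AXPY_LIBSTR to daxpy_libstr and DOT_LIBSTR to ddot_libstr, and assuming a
+// d_allocate_strvec helper is available from the aux routines:
+//
+//	int m = 8;
+//	double alpha = 2.0;
+//	struct d_strvec sx, sy, sz;
+//	d_allocate_strvec(m, &sx);
+//	d_allocate_strvec(m, &sy);
+//	d_allocate_strvec(m, &sz);
+//	// fill sx.pa[0..m-1] and sy.pa[0..m-1], then
+//	daxpy_libstr(m, alpha, &sx, 0, &sy, 0, &sz, 0);	// sz <= sy + alpha*sx
+//	double dot = ddot_libstr(m, &sx, 0, &sy, 0);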
+
+
+#elif defined(LA_BLAS)
+
+
+
+void AXPY_LIBSTR(int m, REAL alpha, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+	{
+	if(m<=0)
+		return;
+	int i1 = 1;
+	REAL *x = sx->pa + xi;
+	REAL *y = sy->pa + yi;
+	REAL *z = sz->pa + zi;
+	if(y!=z)
+		COPY(&m, y, &i1, z, &i1);
+	AXPY(&m, &alpha, x, &i1, z, &i1);
+	return;
+	}
+
+
+
+// multiply two vectors and compute dot product
+REAL VECMULDOT_LIBSTR(int m, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+	{
+	if(m<=0)
+		return 0.0;
+	REAL *x = sx->pa + xi;
+	REAL *y = sy->pa + yi;
+	REAL *z = sz->pa + zi;
+	int ii;
+	REAL dot = 0.0;
+	ii = 0;
+	for(; ii<m; ii++)
+		{
+		z[ii+0] = x[ii+0] * y[ii+0];
+		dot += z[ii+0];
+		}
+	return dot;
+	}
+
+
+
+// compute dot product of two vectors
+REAL DOT_LIBSTR(int m, struct STRVEC *sx, int xi, struct STRVEC *sy, int yi)
+	{
+	if(m<=0)
+		return 0.0;
+	REAL *x = sx->pa + xi;
+	REAL *y = sy->pa + yi;
+	int ii;
+	REAL dot = 0.0;
+	ii = 0;
+	for(; ii<m-3; ii+=4)
+		{
+		dot += x[ii+0] * y[ii+0];
+		dot += x[ii+1] * y[ii+1];
+		dot += x[ii+2] * y[ii+2];
+		dot += x[ii+3] * y[ii+3];
+		}
+	for(; ii<m; ii++)
+		{
+		dot += x[ii+0] * y[ii+0];
+		}
+	return dot;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/blas/x_blas2_diag_lib.c b/blas/x_blas2_diag_lib.c
new file mode 100644
index 0000000..e90cbd6
--- /dev/null
+++ b/blas/x_blas2_diag_lib.c
@@ -0,0 +1,51 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
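+// z <= alpha*diag(a)*x + beta*y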
+void GEMV_DIAG_LIBSTR(int m, REAL alpha, struct STRVEC *sA, int ai, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+	{
+	if(m<=0)
+		return;
+	int ii;
+	REAL *a = sA->pa + ai;
+	REAL *x = sx->pa + xi;
+	REAL *y = sy->pa + yi;
+	REAL *z = sz->pa + zi;
+	if(alpha==1.0 & beta==1.0)
+		{
+		for(ii=0; ii<m; ii++)
+			z[ii] = a[ii]*x[ii] + y[ii];
+		}
+	else
+		{
+		for(ii=0; ii<m; ii++)
+			z[ii] = alpha*a[ii]*x[ii] + beta*y[ii];
+		}
+
+	return;
+
+	}
diff --git a/blas/x_blas2_lib.c b/blas/x_blas2_lib.c
new file mode 100644
index 0000000..32e1e0a
--- /dev/null
+++ b/blas/x_blas2_lib.c
@@ -0,0 +1,1466 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE)
+
+
+
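+// z <= beta*y + alpha*A*x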
+void GEMV_N_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+	{
+	int ii, jj;
+	REAL 
+		y_0, y_1, y_2, y_3,
+		x_0, x_1;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *y = sy->pa + yi;
+	REAL *z = sz->pa + zi;
+#if 1 // y reg version
+	ii = 0;
+	for(; ii<m-1; ii+=2)
+		{
+		y_0 = 0.0;
+		y_1 = 0.0;
+		jj = 0;
+		for(; jj<n-1; jj+=2)
+			{
+			y_0 += pA[ii+0+lda*(jj+0)] * x[jj+0] + pA[ii+0+lda*(jj+1)] * x[jj+1];
+			y_1 += pA[ii+1+lda*(jj+0)] * x[jj+0] + pA[ii+1+lda*(jj+1)] * x[jj+1];
+			}
+		if(jj<n)
+			{
+			y_0 += pA[ii+0+lda*jj] * x[jj];
+			y_1 += pA[ii+1+lda*jj] * x[jj];
+			}
+		z[ii+0] = beta * y[ii+0] + alpha * y_0;
+		z[ii+1] = beta * y[ii+1] + alpha * y_1;
+		}
+	for(; ii<m; ii++)
+		{
+		y_0 = 0.0;
+		for(jj=0; jj<n; jj++)
+			{
+			y_0 += pA[ii+lda*jj] * x[jj];
+			}
+		z[ii] = beta * y[ii] + alpha * y_0;
+		}
+#else // x reg version
+	for(ii=0; ii<n; ii++)
+		{
+		z[ii] = beta * y[ii];
+		}
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		x_0 = alpha * x[jj+0];
+		x_1 = alpha * x[jj+1];
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			z[ii+0] += pA[ii+0+lda*(jj+0)] * x_0 + pA[ii+0+lda*(jj+1)] * x_1;
+			z[ii+1] += pA[ii+1+lda*(jj+0)] * x_0 + pA[ii+1+lda*(jj+1)] * x_1;
+			}
+		for(; ii<m; ii++)
+			{
+			z[ii] += pA[ii+lda*(jj+0)] * x_0;
+			z[ii] += pA[ii+lda*(jj+1)] * x_1;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		x_0 = alpha * x[jj+0];
+		for(ii=0; ii<m; ii++)
+			{
+			z[ii] += pA[ii+lda*(jj+0)] * x_0;
+			}
+		}
+#endif
+	return;
+	}
+
+
+
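+// z <= beta*y + alpha*A'*x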
+void GEMV_T_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+	{
+	int ii, jj;
+	REAL 
+		y_0, y_1;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *y = sy->pa + yi;
+	REAL *z = sz->pa + zi;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		y_0 = 0.0;
+		y_1 = 0.0;
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			y_0 += pA[ii+0+lda*(jj+0)] * x[ii+0] + pA[ii+1+lda*(jj+0)] * x[ii+1];
+			y_1 += pA[ii+0+lda*(jj+1)] * x[ii+0] + pA[ii+1+lda*(jj+1)] * x[ii+1];
+			}
+		if(ii<m)
+			{
+			y_0 += pA[ii+lda*(jj+0)] * x[ii];
+			y_1 += pA[ii+lda*(jj+1)] * x[ii];
+			}
+		z[jj+0] = beta * y[jj+0] + alpha * y_0;
+		z[jj+1] = beta * y[jj+1] + alpha * y_1;
+		}
+	for(; jj<n; jj++)
+		{
+		y_0 = 0.0;
+		for(ii=0; ii<m; ii++)
+			{
+			y_0 += pA[ii+lda*(jj+0)] * x[ii];
+			}
+		z[jj+0] = beta * y[jj+0] + alpha * y_0;
+		}
+	return;
+	}
+
+
+
+// TODO optimize !!!!!
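+// fused gemv_n and gemv_t on the same matrix:
+// z_n <= beta_n*y_n + alpha_n*A*x_n ; z_t <= beta_t*y_t + alpha_t*A'*x_t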
+void GEMV_NT_LIBSTR(int m, int n, REAL alpha_n, REAL alpha_t, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx_n, int xi_n, struct STRVEC *sx_t, int xi_t, REAL beta_n, REAL beta_t, struct STRVEC *sy_n, int yi_n, struct STRVEC *sy_t, int yi_t, struct STRVEC *sz_n, int zi_n, struct STRVEC *sz_t, int zi_t)
+	{
+	int ii, jj;
+	REAL
+		a_00,
+		x_n_0,
+		y_t_0;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x_n = sx_n->pa + xi_n;
+	REAL *x_t = sx_t->pa + xi_t;
+	REAL *y_n = sy_n->pa + yi_n;
+	REAL *y_t = sy_t->pa + yi_t;
+	REAL *z_n = sz_n->pa + zi_n;
+	REAL *z_t = sz_t->pa + zi_t;
+	for(ii=0; ii<m; ii++)
+		{
+		z_n[ii] = beta_n * y_n[ii];
+		}
+	for(jj=0; jj<n; jj++)
+		{
+		y_t_0 = 0.0;
+		x_n_0 = alpha_n * x_n[jj];
+		for(ii=0; ii<m; ii++)
+			{
+			a_00 = pA[ii+lda*jj];
+			z_n[ii] += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t[ii];
+			}
+		z_t[jj] = beta_t * y_t[jj] + alpha_t * y_t_0;
+		}
+	return;
+	}
+
+
+
+// TODO optimize !!!!!
+void SYMV_L_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+	{
+	int ii, jj;
+	REAL
+		y_0;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *y = sy->pa + yi;
+	REAL *z = sz->pa + zi;
+	for(ii=0; ii<n; ii++)
+		{
+		y_0 = 0.0;
+		jj = 0;
+		for(; jj<=ii; jj++)
+			{
+			y_0 += pA[ii+lda*jj] * x[jj];
+			}
+		for( ; jj<m; jj++)
+			{
+			y_0 += pA[jj+lda*ii] * x[jj];
+			}
+		z[ii] = beta * y[ii] + alpha * y_0;
+		}
+	return;
+	}
+
+
+
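+// z <= A*x ; A (m x n, m>=n) lower triangular, not transposed, not unit diagonal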
+void TRMV_LNN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	int ii, jj;
+	REAL
+		y_0, y_1;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	if(m-n>0)
+		{
+		GEMV_N_LIBSTR(m-n, n, 1.0, sA, ai+n, aj, sx, xi, 0.0, sz, zi+n, sz, zi+n);
+		}
+	if(n%2!=0)
+		{
+		ii = n-1;
+		y_0 = x[ii];
+		y_0 *= pA[ii+lda*ii];
+		for(jj=0; jj<ii; jj++)
+			{
+			y_0 += pA[ii+lda*jj] * x[jj];
+			}
+		z[ii] = y_0;
+		n -= 1;
+		}
+	for(ii=n-2; ii>=0; ii-=2)
+		{
+		y_0 = x[ii+0];
+		y_1 = x[ii+1];
+		y_1 *= pA[ii+1+lda*(ii+1)];
+		y_1 += pA[ii+1+lda*(ii+0)] * y_0;
+		y_0 *= pA[ii+0+lda*(ii+0)];
+		jj = 0;
+		for(; jj<ii-1; jj+=2)
+			{
+			y_0 += pA[ii+0+lda*(jj+0)] * x[jj+0] + pA[ii+0+lda*(jj+1)] * x[jj+1];
+			y_1 += pA[ii+1+lda*(jj+0)] * x[jj+0] + pA[ii+1+lda*(jj+1)] * x[jj+1];
+			}
+//	XXX there is no clean-up loop, since ii is even !!!!!
+//		for(; jj<ii; jj++)
+//			{
+//			y_0 += pA[ii+0+lda*jj] * x[jj];
+//			y_1 += pA[ii+1+lda*jj] * x[jj];
+//			}
+		z[ii+0] = y_0;
+		z[ii+1] = y_1;
+		}
+	return;
+	}
+
+
+
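+// z <= A'*x ; A (m x n, m>=n) lower triangular, transposed, not unit diagonal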
+void TRMV_LTN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	int ii, jj;
+	REAL
+		y_0, y_1;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		y_0 = x[jj+0];
+		y_1 = x[jj+1];
+		y_0 *= pA[jj+0+lda*(jj+0)];
+		y_0 += pA[jj+1+lda*(jj+0)] * y_1;
+		y_1 *= pA[jj+1+lda*(jj+1)];
+		ii = jj+2;
+		for(; ii<m-1; ii+=2)
+			{
+			y_0 += pA[ii+0+lda*(jj+0)] * x[ii+0] + pA[ii+1+lda*(jj+0)] * x[ii+1];
+			y_1 += pA[ii+0+lda*(jj+1)] * x[ii+0] + pA[ii+1+lda*(jj+1)] * x[ii+1];
+			}
+		for(; ii<m; ii++)
+			{
+			y_0 += pA[ii+lda*(jj+0)] * x[ii];
+			y_1 += pA[ii+lda*(jj+1)] * x[ii];
+			}
+		z[jj+0] = y_0;
+		z[jj+1] = y_1;
+		}
+	for(; jj<n; jj++)
+		{
+		y_0 = x[jj];
+		y_0 *= pA[jj+lda*jj];
+		for(ii=jj+1; ii<m; ii++)
+			{
+			y_0 += pA[ii+lda*jj] * x[ii];
+			}
+		z[jj] = y_0;
+		}
+	return;
+	}
+
+
+
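+// z <= A*x ; A (m x m) upper triangular, not transposed, not unit diagonal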
+void TRMV_UNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	int ii, jj;
+	REAL
+		y_0, y_1,
+		x_0, x_1;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+#if 1 // y reg version
+	jj = 0;
+	for(; jj<m-1; jj+=2)
+		{
+		y_0 = x[jj+0];
+		y_1 = x[jj+1];
+		y_0 = pA[jj+0+lda*(jj+0)] * y_0;
+		y_0 += pA[jj+0+lda*(jj+1)] * y_1;
+		y_1 = pA[jj+1+lda*(jj+1)] * y_1;
+		ii = jj+2;
+		for(; ii<m-1; ii+=2)
+			{
+			y_0 += pA[jj+0+lda*(ii+0)] * x[ii+0] + pA[jj+0+lda*(ii+1)] * x[ii+1];
+			y_1 += pA[jj+1+lda*(ii+0)] * x[ii+0] + pA[jj+1+lda*(ii+1)] * x[ii+1];
+			}
+		if(ii<m)
+			{
+			y_0 += pA[jj+0+lda*(ii+0)] * x[ii+0];
+			y_1 += pA[jj+1+lda*(ii+0)] * x[ii+0];
+			}
+		z[jj+0] = y_0;
+		z[jj+1] = y_1;
+		}
+	for(; jj<m; jj++)
+		{
+		y_0 = pA[jj+lda*jj] * x[jj];
+		for(ii=jj+1; ii<m; ii++)
+			{
+			y_0 += pA[jj+lda*ii] * x[ii];
+			}
+		z[jj] = y_0;
+		}
+#else // x reg version
+	if(x != z)
+		{
+		for(ii=0; ii<m; ii++)
+			z[ii] = x[ii];
+		}
+	jj = 0;
+	for(; jj<m-1; jj+=2)
+		{
+		x_0 = z[jj+0];
+		x_1 = z[jj+1];
+		ii = 0;
+		for(; ii<jj-1; ii+=2)
+			{
+			z[ii+0] += pA[ii+0+lda*(jj+0)] * x_0 + pA[ii+0+lda*(jj+1)] * x_1;
+			z[ii+1] += pA[ii+1+lda*(jj+0)] * x_0 + pA[ii+1+lda*(jj+1)] * x_1;
+			}
+//	XXX there is no clean-up loop, since jj+=2 !!!!!
+//		for(; ii<jj; ii++)
+//			{
+//			z[ii+0] += pA[ii+0+lda*(jj+0)] * x_0 + pA[ii+0+lda*(jj+1)] * x_1;
+//			}
+		x_0 *= pA[jj+0+lda*(jj+0)];
+		x_0 += pA[jj+0+lda*(jj+1)] * x_1;
+		x_1 *= pA[jj+1+lda*(jj+1)];
+		z[jj+0] = x_0;
+		z[jj+1] = x_1;
+		}
+	for(; jj<m; jj++)
+		{
+		x_0 = z[jj];
+		for(ii=0; ii<jj; ii++)
+			{
+			z[ii] += pA[ii+lda*jj] * x_0;
+			}
+		x_0 *= pA[jj+lda*jj];
+		z[jj] = x_0;
+		}
+#endif
+	return;
+	}
+
+
+
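+// z <= A'*x ; A (m x m) upper triangular, transposed, not unit diagonal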
+void TRMV_UTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	int ii, jj;
+	REAL
+		y_0, y_1;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	if(m%2!=0)
+		{
+		jj = m-1;
+		y_0 = pA[jj+lda*jj] * x[jj];
+		for(ii=0; ii<jj; ii++)
+			{
+			y_0 += pA[ii+lda*jj] * x[ii];
+			}
+		z[jj] = y_0;
+		m -= 1; // XXX
+		}
+	for(jj=m-2; jj>=0; jj-=2)
+		{
+		y_1 = pA[jj+1+lda*(jj+1)] * x[jj+1];
+		y_1 += pA[jj+0+lda*(jj+1)] * x[jj+0];
+		y_0 = pA[jj+0+lda*(jj+0)] * x[jj+0];
+		for(ii=0; ii<jj-1; ii+=2)
+			{
+			y_0 += pA[ii+0+lda*(jj+0)] * x[ii+0] + pA[ii+1+lda*(jj+0)] * x[ii+1];
+			y_1 += pA[ii+0+lda*(jj+1)] * x[ii+0] + pA[ii+1+lda*(jj+1)] * x[ii+1];
+			}
+//	XXX there is no clean-up loop !!!!!
+//		if(ii<jj)
+//			{
+//			y_0 += pA[ii+lda*(jj+0)] * x[ii];
+//			y_1 += pA[ii+lda*(jj+1)] * x[ii];
+//			}
+		z[jj+0] = y_0;
+		z[jj+1] = y_1;
+		}
+	return;
+	}
+
+
+
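+// block forward substitution on an (m x n, m>=n) lower triangular A:
+// z1 <= inv(A11) * x1 ; z2 <= x2 - A21 * z1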
+void TRSV_LNN_MN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	if(m==0 | n==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** trsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** trsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** trsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** trsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** trsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** trsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** trsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** trsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** trsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** trsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	int ii, jj, j1;
+	REAL
+		y_0, y_1,
+		x_0, x_1;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *dA = sA->dA;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			for(ii=0; ii<n; ii++)
+				dA[ii] = 1.0 / pA[ii+lda*ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		for(ii=0; ii<n; ii++)
+			dA[ii] = 1.0 / pA[ii+lda*ii];
+		sA->use_dA = 0;
+		}
+#if 1 // y reg version
+	ii = 0;
+	for(; ii<n-1; ii+=2)
+		{
+		y_0 = x[ii+0];
+		y_1 = x[ii+1];
+		jj = 0;
+		for(; jj<ii-1; jj+=2)
+			{
+			y_0 -= pA[ii+0+lda*(jj+0)] * z[jj+0] + pA[ii+0+lda*(jj+1)] * z[jj+1];
+			y_1 -= pA[ii+1+lda*(jj+0)] * z[jj+0] + pA[ii+1+lda*(jj+1)] * z[jj+1];
+			}
+//	XXX there is no clean-up loop !!!!!
+//		if(jj<ii)
+//			{
+//			y_0 -= pA[ii+0+lda*(jj+0)] * z[jj+0];
+//			y_1 -= pA[ii+1+lda*(jj+0)] * z[jj+0];
+//			}
+		y_0 *= dA[ii+0];
+		y_1 -= pA[ii+1+lda*(jj+0)] * y_0;
+		y_1 *= dA[ii+1];
+		z[ii+0] = y_0;
+		z[ii+1] = y_1;
+		}
+	for(; ii<n; ii++)
+		{
+		y_0 = x[ii];
+		for(jj=0; jj<ii; jj++)
+			{
+			y_0 -= pA[ii+lda*jj] * z[jj];
+			}
+		y_0 *= dA[ii];
+		z[ii] = y_0;
+		}
+	for(; ii<m-1; ii+=2)
+		{
+		y_0 = x[ii+0];
+		y_1 = x[ii+1];
+		jj = 0;
+		for(; jj<n-1; jj+=2)
+			{
+			y_0 -= pA[ii+0+lda*(jj+0)] * z[jj+0] + pA[ii+0+lda*(jj+1)] * z[jj+1];
+			y_1 -= pA[ii+1+lda*(jj+0)] * z[jj+0] + pA[ii+1+lda*(jj+1)] * z[jj+1];
+			}
+		if(jj<n)
+			{
+			y_0 -= pA[ii+0+lda*(jj+0)] * z[jj+0];
+			y_1 -= pA[ii+1+lda*(jj+0)] * z[jj+0];
+			}
+		z[ii+0] = y_0;
+		z[ii+1] = y_1;
+		}
+	for(; ii<m; ii++)
+		{
+		y_0 = x[ii];
+		for(jj=0; jj<n; jj++)
+			{
+			y_0 -= pA[ii+lda*jj] * z[jj];
+			}
+		z[ii] = y_0;
+		}
+#else // x reg version
+	if(x != z)
+		{
+		for(ii=0; ii<m; ii++)
+			z[ii] = x[ii];
+		}
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		x_0 = dA[jj+0] * z[jj+0];
+		x_1 = z[jj+1] - pA[jj+1+lda*(jj+0)] * x_0;
+		x_1 = dA[jj+1] * x_1;
+		z[jj+0] = x_0;
+		z[jj+1] = x_1;
+		ii = jj+2;
+		for(; ii<m-1; ii+=2)
+			{
+			z[ii+0] -= pA[ii+0+lda*(jj+0)] * x_0 + pA[ii+0+lda*(jj+1)] * x_1;
+			z[ii+1] -= pA[ii+1+lda*(jj+0)] * x_0 + pA[ii+1+lda*(jj+1)] * x_1;
+			}
+		for(; ii<m; ii++)
+			{
+			z[ii] -= pA[ii+lda*(jj+0)] * x_0 + pA[ii+lda*(jj+1)] * x_1;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		x_0 = dA[jj] * z[jj];
+		z[jj] = x_0;
+		for(ii=jj+1; ii<m; ii++)
+			{
+			z[ii] -= pA[ii+lda*jj] * x_0;
+			}
+		}
+#endif
+	return;
+	}
+
+
+
+void TRSV_LTN_MN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** trsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** trsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** trsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** trsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** trsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** trsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** trsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** trsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** trsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** trsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	int ii, jj;
+	REAL
+		y_0, y_1;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *dA = sA->dA;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			for(ii=0; ii<n; ii++)
+				dA[ii] = 1.0 / pA[ii+lda*ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		for(ii=0; ii<n; ii++)
+			dA[ii] = 1.0 / pA[ii+lda*ii];
+		sA->use_dA = 0;
+		}
+	if(n%2!=0)
+		{
+		jj = n-1;
+		y_0 = x[jj];
+		for(ii=jj+1; ii<m; ii++)
+			{
+			y_0 -= pA[ii+lda*jj] * z[ii];
+			}
+		y_0 *= dA[jj];
+		z[jj] = y_0;
+		jj -= 2;
+		}
+	else
+		{
+		jj = n-2;
+		}
+	for(; jj>=0; jj-=2)
+		{
+		y_0 = x[jj+0];
+		y_1 = x[jj+1];
+		ii = jj+2;
+		for(; ii<m-1; ii+=2)
+			{
+			y_0 -= pA[ii+0+lda*(jj+0)] * z[ii+0] + pA[ii+1+lda*(jj+0)] * z[ii+1];
+			y_1 -= pA[ii+0+lda*(jj+1)] * z[ii+0] + pA[ii+1+lda*(jj+1)] * z[ii+1];
+			}
+		if(ii<m)
+			{
+			y_0 -= pA[ii+lda*(jj+0)] * z[ii];
+			y_1 -= pA[ii+lda*(jj+1)] * z[ii];
+			}
+		y_1 *= dA[jj+1];
+		y_0 -= pA[jj+1+lda*(jj+0)] * y_1;
+		y_0 *= dA[jj+0];
+		z[jj+0] = y_0;
+		z[jj+1] = y_1;
+		}
+	return;
+	}
+
+
+
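+// z <= inv(A)*x ; A lower triangular, not transposed, not unit diagonal (forward substitution)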
+void TRSV_LNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** trsv_lnn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** trsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** trsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** trsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** trsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** trsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** trsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** trsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** trsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	int ii, jj, j1;
+	REAL
+		y_0, y_1,
+		x_0, x_1;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *dA = sA->dA;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			for(ii=0; ii<m; ii++)
+				dA[ii] = 1.0 / pA[ii+lda*ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		for(ii=0; ii<m; ii++)
+			dA[ii] = 1.0 / pA[ii+lda*ii];
+		sA->use_dA = 0;
+		}
+	ii = 0;
+	for(; ii<m-1; ii+=2)
+		{
+		y_0 = x[ii+0];
+		y_1 = x[ii+1];
+		jj = 0;
+		for(; jj<ii-1; jj+=2)
+			{
+			y_0 -= pA[ii+0+lda*(jj+0)] * z[jj+0] + pA[ii+0+lda*(jj+1)] * z[jj+1];
+			y_1 -= pA[ii+1+lda*(jj+0)] * z[jj+0] + pA[ii+1+lda*(jj+1)] * z[jj+1];
+			}
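+		// ii is even, so jj has stopped exactly at ii and no clean-up loop is
+		// needed; apply the subdiagonal entry of the 2x2 diagonal block, then
+		// scale by the diagonal entries, which are stored inverted in dA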
+		y_0 *= dA[ii+0];
+		y_1 -= pA[ii+1+lda*(jj+0)] * y_0;
+		y_1 *= dA[ii+1];
+		z[ii+0] = y_0;
+		z[ii+1] = y_1;
+		}
+	for(; ii<m; ii++)
+		{
+		y_0 = x[ii];
+		for(jj=0; jj<ii; jj++)
+			{
+			y_0 -= pA[ii+lda*jj] * z[jj];
+			}
+		y_0 *= dA[ii];
+		z[ii] = y_0;
+		}
+	return;
+	}
+
+
+
+void TRSV_LNU_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** trsv_lnu_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** trsv_lnu_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** trsv_lnu_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** trsv_lnu_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** trsv_lnu_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** trsv_lnu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** trsv_lnu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** trsv_lnu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** trsv_lnu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	printf("\n***** trsv_lnu_libstr : feature not implemented yet *****\n");
+	exit(1);
+	}
+
+
+
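+// z <= inv(A')*x ; A lower triangular, transposed, not unit diagonal (backward substitution)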
+void TRSV_LTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** trsv_ltn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** trsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** trsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** trsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** trsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** trsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** trsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** trsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** trsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	int ii, jj;
+	REAL
+		y_0, y_1;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *dA = sA->dA;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			for(ii=0; ii<m; ii++)
+				dA[ii] = 1.0 / pA[ii+lda*ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		for(ii=0; ii<m; ii++)
+			dA[ii] = 1.0 / pA[ii+lda*ii];
+		sA->use_dA = 0;
+		}
+	if(m%2!=0)
+		{
+		jj = m-1;
+		y_0 = x[jj];
+		y_0 *= dA[jj];
+		z[jj] = y_0;
+		jj -= 2;
+		}
+	else
+		{
+		jj = m-2;
+		}
+	for(; jj>=0; jj-=2)
+		{
+		y_0 = x[jj+0];
+		y_1 = x[jj+1];
+		ii = jj+2;
+		for(; ii<m-1; ii+=2)
+			{
+			y_0 -= pA[ii+0+lda*(jj+0)] * z[ii+0] + pA[ii+1+lda*(jj+0)] * z[ii+1];
+			y_1 -= pA[ii+0+lda*(jj+1)] * z[ii+0] + pA[ii+1+lda*(jj+1)] * z[ii+1];
+			}
+		if(ii<m)
+			{
+			y_0 -= pA[ii+lda*(jj+0)] * z[ii];
+			y_1 -= pA[ii+lda*(jj+1)] * z[ii];
+			}
+		y_1 *= dA[jj+1];
+		y_0 -= pA[jj+1+lda*(jj+0)] * y_1;
+		y_0 *= dA[jj+0];
+		z[jj+0] = y_0;
+		z[jj+1] = y_1;
+		}
+	return;
+	}
+
+
+
+void TRSV_LTU_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** trsv_ltu_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** trsv_ltu_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** trsv_ltu_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** trsv_ltu_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** trsv_ltu_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** trsv_ltu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** trsv_ltu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** trsv_ltu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** trsv_ltu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	printf("\n***** trsv_ltu_libstr : feature not implemented yet *****\n");
+	exit(1);
+	}
+
+
+
+void TRSV_UNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** trsv_unn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** trsv_unn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** trsv_unn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** trsv_unn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** trsv_unn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** trsv_unn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** trsv_unn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** trsv_unn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** trsv_unn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	printf("\n***** trsv_unn_libstr : feature not implemented yet *****\n");
+	exit(1);
+	}
+
+
+
+void TRSV_UTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** trsv_utn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** trsv_utn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** trsv_utn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** trsv_utn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** trsv_utn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** trsv_utn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** trsv_utn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** trsv_utn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** trsv_utn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	printf("\n***** trsv_utn_libstr : feature not implemented yet *****\n");
+	exit(1);
+	}
+
+
+
+#elif defined(LA_BLAS)
+
+
+
+void GEMV_N_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+	{
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *y = sy->pa + yi;
+	REAL *z = sz->pa + zi;
+	COPY(&m, y, &i1, z, &i1);
+	GEMV(&cn, &m, &n, &alpha, pA, &lda, x, &i1, &beta, z, &i1);
+	return;
+	}
+
+
+
+void GEMV_T_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+	{
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *y = sy->pa + yi;
+	REAL *z = sz->pa + zi;
+	COPY(&n, y, &i1, z, &i1);
+	GEMV(&ct, &m, &n, &alpha, pA, &lda, x, &i1, &beta, z, &i1);
+	return;
+	}
+
+
+
+void GEMV_NT_LIBSTR(int m, int n, REAL alpha_n, REAL alpha_t, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx_n, int xi_n, struct STRVEC *sx_t, int xi_t, REAL beta_n, REAL beta_t, struct STRVEC *sy_n, int yi_n, struct STRVEC *sy_t, int yi_t, struct STRVEC *sz_n, int zi_n, struct STRVEC *sz_t, int zi_t)
+	{
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x_n = sx_n->pa + xi_n;
+	REAL *x_t = sx_t->pa + xi_t;
+	REAL *y_n = sy_n->pa + yi_n;
+	REAL *y_t = sy_t->pa + yi_t;
+	REAL *z_n = sz_n->pa + zi_n;
+	REAL *z_t = sz_t->pa + zi_t;
+	COPY(&m, y_n, &i1, z_n, &i1);
+	GEMV(&cn, &m, &n, &alpha_n, pA, &lda, x_n, &i1, &beta_n, z_n, &i1);
+	COPY(&n, y_t, &i1, z_t, &i1);
+	GEMV(&ct, &m, &n, &alpha_t, pA, &lda, x_t, &i1, &beta_t, z_t, &i1);
+	return;
+	}
+
+
+
+void SYMV_L_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, REAL beta, struct STRVEC *sy, int yi, struct STRVEC *sz, int zi)
+	{
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	REAL d1 = 1.0;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *y = sy->pa + yi;
+	REAL *z = sz->pa + zi;
+	int tmp = m-n;
+	COPY(&m, y, &i1, z, &i1);
+	SYMV(&cl, &n, &alpha, pA, &lda, x, &i1, &beta, z, &i1);
+	GEMV(&cn, &tmp, &n, &alpha, pA+n, &lda, x, &i1, &beta, z+n, &i1);
+	GEMV(&ct, &tmp, &n, &alpha, pA+n, &lda, x+n, &i1, &d1, z, &i1);
+	return;
+	}
+
+
+
+void TRMV_LNN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	REAL d1 = 1.0;
+	REAL d0 = 0.0;
+	REAL dm1 = -1.0;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	int tmp = m-n;
+	if(x!=z)
+		COPY(&n, x, &i1, z, &i1);
+	GEMV(&cn, &tmp, &n, &d1, pA+n, &lda, x, &i1, &d0, z+n, &i1);
+	TRMV(&cl, &cn, &cn, &n, pA, &lda, z, &i1);
+	return;
+	}
+
+
+
+void TRMV_LTN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	REAL d1 = 1.0;
+	REAL dm1 = -1.0;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	int tmp = m-n;
+	if(x!=z)
+		COPY(&n, x, &i1, z, &i1);
+	TRMV(&cl, &ct, &cn, &n, pA, &lda, z, &i1);
+	GEMV(&ct, &tmp, &n, &d1, pA+n, &lda, x+n, &i1, &d1, z, &i1);
+	return;
+	}
+
+
+
+void TRMV_UNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	REAL d1 = 1.0;
+	REAL dm1 = -1.0;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	COPY(&m, x, &i1, z, &i1);
+	TRMV(&cu, &cn, &cn, &m, pA, &lda, z, &i1);
+	return;
+	}
+
+
+
+void TRMV_UTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	REAL d1 = 1.0;
+	REAL dm1 = -1.0;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	COPY(&m, x, &i1, z, &i1);
+	TRMV(&cu, &ct, &cn, &m, pA, &lda, z, &i1);
+	return;
+	}
+
+
+
+void TRSV_LNN_MN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	if(m==0 | n==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** trsv_lnn_mn_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** trsv_lnn_mn_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** trsv_lnn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** trsv_lnn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** trsv_lnn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** trsv_lnn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** trsv_lnn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** trsv_lnn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** trsv_lnn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** trsv_lnn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	REAL d1 = 1.0;
+	REAL dm1 = -1.0;
+	int mmn = m-n;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
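+	// block forward substitution: z1 <= inv(A11) * x1 on the leading n x n
+	// lower triangle, then z2 <= x2 - A21 * z1 on the trailing m-n rows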
+	COPY(&m, x, &i1, z, &i1);
+	TRSV(&cl, &cn, &cn, &n, pA, &lda, z, &i1);
+	GEMV(&cn, &mmn, &n, &dm1, pA+n, &lda, z, &i1, &d1, z+n, &i1);
+	return;
+	}
+
+
+
+void TRSV_LTN_MN_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** trsv_ltn_mn_libstr : m<0 : %d<0 *****\n", m);
+	if(n<0) printf("\n****** trsv_ltn_mn_libstr : n<0 : %d<0 *****\n", n);
+	// non-negative offset
+	if(ai<0) printf("\n****** trsv_ltn_mn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** trsv_ltn_mn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** trsv_ltn_mn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** trsv_ltn_mn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** trsv_ltn_mn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+n > sA->n) printf("\n***** trsv_ltn_mn_libstr : aj+n > col(A) : %d+%d > %d *****\n", aj, n, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** trsv_ltn_mn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** trsv_ltn_mn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	REAL d1 = 1.0;
+	REAL dm1 = -1.0;
+	int mmn = m-n;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
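+	// block backward substitution: z1 <= x1 - A21' * x2 using the trailing
+	// m-n rows, then z1 <= inv(A11') * z1 on the leading n x n lower triangle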
+	COPY(&m, x, &i1, z, &i1);
+	GEMV(&ct, &mmn, &n, &dm1, pA+n, &lda, z+n, &i1, &d1, z, &i1);
+	TRSV(&cl, &ct, &cn, &n, pA, &lda, z, &i1);
+	return;
+	}
+
+
+
+void TRSV_LNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** trsv_lnn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** trsv_lnn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** trsv_lnn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** trsv_lnn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** trsv_lnn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** trsv_lnn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** trsv_lnn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** trsv_lnn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** trsv_lnn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	REAL d1 = 1.0;
+	REAL dm1 = -1.0;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	COPY(&m, x, &i1, z, &i1);
+	TRSV(&cl, &cn, &cn, &m, pA, &lda, z, &i1);
+	return;
+	}
+
+
+
+void TRSV_LNU_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** trsv_lnu_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** trsv_lnu_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** trsv_lnu_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** trsv_lnu_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** trsv_lnu_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** trsv_lnu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** trsv_lnu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** trsv_lnu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** trsv_lnu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	REAL d1 = 1.0;
+	REAL dm1 = -1.0;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	COPY(&m, x, &i1, z, &i1);
+	TRSV(&cl, &cn, &cu, &m, pA, &lda, z, &i1);
+	return;
+	}
+
+
+
+void TRSV_LTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** trsv_ltn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** trsv_ltn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** trsv_ltn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** trsv_ltn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** trsv_ltn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** trsv_ltn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** trsv_ltn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** trsv_ltn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** trsv_ltn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	REAL d1 = 1.0;
+	REAL dm1 = -1.0;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	COPY(&m, x, &i1, z, &i1);
+	TRSV(&cl, &ct, &cn, &m, pA, &lda, z, &i1);
+	return;
+	}
+
+
+
+void TRSV_LTU_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** trsv_ltu_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** trsv_ltu_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** trsv_ltu_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** trsv_ltu_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** trsv_ltu_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** trsv_ltu_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** trsv_ltu_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** trsv_ltu_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** trsv_ltu_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	REAL d1 = 1.0;
+	REAL dm1 = -1.0;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	COPY(&m, x, &i1, z, &i1);
+	TRSV(&cl, &ct, &cu, &m, pA, &lda, z, &i1);
+	return;
+	}
+
+
+
+void TRSV_UNN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** trsv_unn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** trsv_unn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** trsv_unn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** trsv_unn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** trsv_unn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** trsv_unn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** trsv_unn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** trsv_unn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** trsv_unn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	REAL d1 = 1.0;
+	REAL dm1 = -1.0;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	COPY(&m, x, &i1, z, &i1);
+	TRSV(&cu, &cn, &cn, &m, pA, &lda, z, &i1);
+	return;
+	}
+
+
+
+void TRSV_UTN_LIBSTR(int m, struct STRMAT *sA, int ai, int aj, struct STRVEC *sx, int xi, struct STRVEC *sz, int zi)
+	{
+	if(m==0)
+		return;
+#if defined(DIM_CHECK)
+	// non-negative size
+	if(m<0) printf("\n****** trsv_utn_libstr : m<0 : %d<0 *****\n", m);
+	// non-negative offset
+	if(ai<0) printf("\n****** trsv_utn_libstr : ai<0 : %d<0 *****\n", ai);
+	if(aj<0) printf("\n****** trsv_utn_libstr : aj<0 : %d<0 *****\n", aj);
+	if(xi<0) printf("\n****** trsv_utn_libstr : xi<0 : %d<0 *****\n", xi);
+	if(zi<0) printf("\n****** trsv_utn_libstr : zi<0 : %d<0 *****\n", zi);
+	// inside matrix
+	// A: m x k
+	if(ai+m > sA->m) printf("\n***** trsv_utn_libstr : ai+m > row(A) : %d+%d > %d *****\n", ai, m, sA->m);
+	if(aj+m > sA->n) printf("\n***** trsv_utn_libstr : aj+m > col(A) : %d+%d > %d *****\n", aj, m, sA->n);
+	// x: m
+	if(xi+m > sx->m) printf("\n***** trsv_utn_libstr : xi+m > size(x) : %d+%d > %d *****\n", xi, m, sx->m);
+	// z: m
+	if(zi+m > sz->m) printf("\n***** trsv_utn_libstr : zi+m > size(z) : %d+%d > %d *****\n", zi, m, sz->m);
+#endif
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	REAL d1 = 1.0;
+	REAL dm1 = -1.0;
+	int lda = sA->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *x = sx->pa + xi;
+	REAL *z = sz->pa + zi;
+	COPY(&m, x, &i1, z, &i1);
+	TRSV(&cu, &ct, &cn, &m, pA, &lda, z, &i1);
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
diff --git a/blas/x_blas3_diag_lib.c b/blas/x_blas3_diag_lib.c
new file mode 100644
index 0000000..d5cce93
--- /dev/null
+++ b/blas/x_blas3_diag_lib.c
@@ -0,0 +1,170 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE) | defined(LA_BLAS) 
+
+
+
+// dgemm with A diagonal matrix (stored as strvec)
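+// D <= alpha * diag(A) * B + beta * C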
+void GEMM_L_DIAG_LIBSTR(int m, int n, REAL alpha, struct STRVEC *sA, int ai, struct STRMAT *sB, int bi, int bj, double beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	REAL *dA = sA->pa + ai;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pD = sD->pA + di + dj*ldd;
+	REAL a0, a1;
+	if(beta==0.0)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			a0 = alpha * dA[ii+0];
+			a1 = alpha * dA[ii+1];
+			for(jj=0; jj<n; jj++)
+				{
+				pD[ii+0+ldd*jj] = a0 * pB[ii+0+ldb*jj];
+				pD[ii+1+ldd*jj] = a1 * pB[ii+1+ldb*jj];
+				}
+			}
+		for(; ii<m; ii++)
+			{
+			a0 = alpha * dA[ii];
+			for(jj=0; jj<n; jj++)
+				{
+				pD[ii+0+ldd*jj] = a0 * pB[ii+0+ldb*jj];
+				}
+			}
+		}
+	else
+		{
+		int ldc = sC->m;
+		REAL *pC = sC->pA + ci + cj*ldc;
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			a0 = alpha * dA[ii+0];
+			a1 = alpha * dA[ii+1];
+			for(jj=0; jj<n; jj++)
+				{
+				pD[ii+0+ldd*jj] = a0 * pB[ii+0+ldb*jj] + beta * pC[ii+0+ldc*jj];
+				pD[ii+1+ldd*jj] = a1 * pB[ii+1+ldb*jj] + beta * pC[ii+1+ldc*jj];
+				}
+			}
+		for(; ii<m; ii++)
+			{
+			a0 = alpha * dA[ii];
+			for(jj=0; jj<n; jj++)
+				{
+				pD[ii+0+ldd*jj] = a0 * pB[ii+0+ldb*jj] + beta * pC[ii+0+ldc*jj];
+				}
+			}
+		}
+	return;
+	}
+
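+// GEMM_L_DIAG_LIBSTR above computes D = alpha*diag(a)*B + beta*C, i.e. it scales the
+// rows of B by the entries of the vector a; when beta==0.0, C is never read.
+// A minimal usage sketch, assuming the double-precision expansion of the macros
+// (REAL==double, so the symbol becomes dgemm_l_diag_libstr) and already-created
+// d_strmat / d_strvec objects sa, sB, sC, sD of compatible sizes:
+//
+//	dgemm_l_diag_libstr(m, n, 1.0, &sa, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+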
+
+
+// dgemm with B diagonal matrix (stored as strvec)
+void GEMM_R_DIAG_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRVEC *sB, int bi, double beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj;
+	int lda = sA->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *dB = sB->pa + bi;
+	REAL *pD = sD->pA + di + dj*ldd;
+	REAL a0, a1;
+	if(beta==0.0)
+		{
+		jj = 0;
+		for(; jj<n-1; jj+=2)
+			{
+			a0 = alpha * dB[jj+0];
+			a1 = alpha * dB[jj+1];
+			for(ii=0; ii<m; ii++)
+				{
+				pD[ii+ldd*(jj+0)] = a0 * pA[ii+lda*(jj+0)];
+				pD[ii+ldd*(jj+1)] = a1 * pA[ii+lda*(jj+1)];
+				}
+			}
+		for(; jj<n; jj++)
+			{
+			a0 = alpha * dB[jj+0];
+			for(ii=0; ii<m; ii++)
+				{
+				pD[ii+ldd*(jj+0)] = a0 * pA[ii+lda*(jj+0)];
+				}
+			}
+		}
+	else
+		{
+		int ldc = sC->m;
+		REAL *pC = sC->pA + ci + cj*ldc;
+		jj = 0;
+		for(; jj<n-1; jj+=2)
+			{
+			a0 = alpha * dB[jj+0];
+			a1 = alpha * dB[jj+1];
+			for(ii=0; ii<m; ii++)
+				{
+				pD[ii+ldd*(jj+0)] = a0 * pA[ii+lda*(jj+0)] + beta * pC[ii+ldc*(jj+0)];
+				pD[ii+ldd*(jj+1)] = a1 * pA[ii+lda*(jj+1)] + beta * pC[ii+ldc*(jj+1)];
+				}
+			}
+		for(; jj<n; jj++)
+			{
+			a0 = alpha * dB[jj+0];
+			for(ii=0; ii<m; ii++)
+				{
+				pD[ii+ldd*(jj+0)] = a0 * pA[ii+lda*(jj+0)] + beta * pC[ii+ldc*(jj+0)];
+				}
+			}
+		}
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
+
diff --git a/blas/x_blas3_lib.c b/blas/x_blas3_lib.c
new file mode 100644
index 0000000..29a33c7
--- /dev/null
+++ b/blas/x_blas3_lib.c
@@ -0,0 +1,1531 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE)
+
+
+
+// dgemm nt
+void GEMM_NT_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	REAL 
+		c_00, c_01,
+		c_10, c_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pC = sC->pA + ci + cj*ldc;
+	REAL *pD = sD->pA + di + dj*ldd;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0;
+			c_10 = 0.0;
+			c_01 = 0.0;
+			c_11 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
+				c_10 += pA[(ii+1)+lda*kk] * pB[(jj+0)+ldb*kk];
+				c_01 += pA[(ii+0)+lda*kk] * pB[(jj+1)+ldb*kk];
+				c_11 += pA[(ii+1)+lda*kk] * pB[(jj+1)+ldb*kk];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
+			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
+			pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			c_01 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
+				c_01 += pA[(ii+0)+lda*kk] * pB[(jj+1)+ldb*kk];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0;
+			c_10 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
+				c_10 += pA[(ii+1)+lda*kk] * pB[(jj+0)+ldb*kk];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[(ii+0)+lda*kk] * pB[(jj+0)+ldb*kk];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+			}
+		}
+	return;
+	}
+
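+// All LA_REFERENCE level-3 routines in this file follow the same scheme: a plain
+// triple loop, hand-unrolled 2x2 in the m and n directions so that the four
+// accumulators c_00, c_10, c_01, c_11 can live in registers, followed by scalar
+// clean-up loops for the odd leftover row and/or column. GEMM_NT_LIBSTR above is
+// the template: the kk loop accumulates a 2x2 block of A*B', and the block is then
+// scaled by alpha and combined with beta*C into D.
+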
+
+
+// dgemm nn
+void GEMM_NN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	REAL 
+		c_00, c_01,
+		c_10, c_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pC = sC->pA + ci + cj*ldc;
+	REAL *pD = sD->pA + di + dj*ldd;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0;
+			c_10 = 0.0;
+			c_01 = 0.0;
+			c_11 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
+				c_10 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+0)];
+				c_01 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+1)];
+				c_11 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+1)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
+			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
+			pD[(ii+1)+ldd*(jj+1)] = alpha * c_11 + beta * pC[(ii+1)+ldc*(jj+1)];
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			c_01 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
+				c_01 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+1)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01 + beta * pC[(ii+0)+ldc*(jj+1)];
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0;
+			c_10 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
+				c_10 += pA[(ii+1)+lda*kk] * pB[kk+ldb*(jj+0)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10 + beta * pC[(ii+1)+ldc*(jj+0)];
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[(ii+0)+lda*kk] * pB[kk+ldb*(jj+0)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00 + beta * pC[(ii+0)+ldc*(jj+0)];
+			}
+		}
+	return;
+	}
+
+
+
+// dtrsm_left_lower_nottransposed_unit
+void TRSM_LLNU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	REAL
+		d_00, d_01,
+		d_10, d_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda; // triangular
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pD = sD->pA + di + dj*ldd;
+#if 1
+	// solve
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			d_00 = alpha * pB[ii+0+ldb*(jj+0)];
+			d_10 = alpha * pB[ii+1+ldb*(jj+0)];
+			d_01 = alpha * pB[ii+0+ldb*(jj+1)];
+			d_11 = alpha * pB[ii+1+ldb*(jj+1)];
+			kk = 0;
+#if 0
+			for(; kk<ii-1; kk+=2)
+				{
+				d_00 -= pA[ii+0+lda*(kk+0)] * pD[kk+ldd*(jj+0)];
+				d_10 -= pA[ii+1+lda*(kk+0)] * pD[kk+ldd*(jj+0)];
+				d_01 -= pA[ii+0+lda*(kk+0)] * pD[kk+ldd*(jj+1)];
+				d_11 -= pA[ii+1+lda*(kk+0)] * pD[kk+ldd*(jj+1)];
+				d_00 -= pA[ii+0+lda*(kk+1)] * pD[kk+ldd*(jj+0)];
+				d_10 -= pA[ii+1+lda*(kk+1)] * pD[kk+ldd*(jj+0)];
+				d_01 -= pA[ii+0+lda*(kk+1)] * pD[kk+ldd*(jj+1)];
+				d_11 -= pA[ii+1+lda*(kk+1)] * pD[kk+ldd*(jj+1)];
+				}
+			if(kk<ii)
+#else
+			for(; kk<ii; kk++)
+#endif
+				{
+				d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+0)];
+				d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pA[ii+0+lda*kk] * pD[kk+ldd*(jj+1)];
+				d_11 -= pA[ii+1+lda*kk] * pD[kk+ldd*(jj+1)];
+				}
+			d_10 -= pA[ii+1+lda*kk] * d_00;
+			d_11 -= pA[ii+1+lda*kk] * d_01;
+			pD[ii+0+ldd*(jj+0)] = d_00;
+			pD[ii+1+ldd*(jj+0)] = d_10;
+			pD[ii+0+ldd*(jj+1)] = d_01;
+			pD[ii+1+ldd*(jj+1)] = d_11;
+			}
+		for(; ii<m; ii++)
+			{
+			d_00 = alpha * pB[ii+ldb*(jj+0)];
+			d_01 = alpha * pB[ii+ldb*(jj+1)];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pA[ii+lda*kk] * pD[kk+ldd*(jj+1)];
+				}
+			pD[ii+ldd*(jj+0)] = d_00;
+			pD[ii+ldd*(jj+1)] = d_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			d_00 = alpha * pB[ii+0+ldb*jj];
+			d_10 = alpha * pB[ii+1+ldb*jj];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pA[ii+0+lda*kk] * pD[kk+ldd*jj];
+				d_10 -= pA[ii+1+lda*kk] * pD[kk+ldd*jj];
+				}
+			d_10 -= pA[ii+1+lda*kk] * d_00;
+			pD[ii+0+ldd*jj] = d_00;
+			pD[ii+1+ldd*jj] = d_10;
+			}
+		for(; ii<m; ii++)
+			{
+			d_00 = alpha * pB[ii+ldb*jj];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pA[ii+lda*kk] * pD[kk+ldd*jj];
+				}
+			pD[ii+ldd*jj] = d_00;
+			}
+		}
+#else
+	// copy
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			for(ii=0; ii<m; ii++)
+				pD[ii+ldd*jj] = alpha * pB[ii+ldb*jj];
+		}
+	for(jj=0; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m; ii++)
+			{
+			d_00 = pD[ii+ldd*jj];
+			for(kk=ii+1; kk<m; kk++)
+				{
+				pD[kk+ldd*jj] -= pA[kk+lda*ii] * d_00;
+				}
+			}
+		}
+#endif
+	return;
+	}
+
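+// Two equivalent formulations of TRSM_LLNU are kept above under #if 1 / #else:
+// the compiled branch solves D column-block by column-block with forward
+// substitution (the kk<ii loop reuses rows of D that are already solved, and the
+// unit diagonal means no division is ever needed), while the disabled branch
+// first copies alpha*B into D and then eliminates in place, column by column.
+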
+
+
+// dtrsm_left_upper_nottransposed_notunit
+void TRSM_LUNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk, id;
+	REAL
+		d_00, d_01,
+		d_10, d_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda; // triangular
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pD = sD->pA + di + dj*ldd;
+	REAL *dA = sA->dA;
+	if(!(sA->use_dA==1 & ai==0 & aj==0))
+		{
+		// invert the diagonal of pA
+		for(ii=0; ii<m; ii++)
+			dA[ii] = 1.0/pA[ii+lda*ii];
+		// dA is valid for this call only (not cached)
+		sA->use_dA = 0;
+		}
+#if 1
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			id = m-ii-2;
+			d_00 = alpha * pB[id+0+ldb*(jj+0)];
+			d_10 = alpha * pB[id+1+ldb*(jj+0)];
+			d_01 = alpha * pB[id+0+ldb*(jj+1)];
+			d_11 = alpha * pB[id+1+ldb*(jj+1)];
+			kk = id+2;
+#if 0
+			for(; kk<m-1; kk+=2)
+				{
+				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+				d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+				d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+				d_11 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+				d_00 -= pA[id+0+lda*(kk+1)] * pD[kk+1+ldd*(jj+0)];
+				d_10 -= pA[id+1+lda*(kk+1)] * pD[kk+1+ldd*(jj+0)];
+				d_01 -= pA[id+0+lda*(kk+1)] * pD[kk+1+ldd*(jj+1)];
+				d_11 -= pA[id+1+lda*(kk+1)] * pD[kk+1+ldd*(jj+1)];
+				}
+			if(kk<m)
+#else
+			for(; kk<m; kk++)
+#endif
+				{
+				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+				d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+				d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+				d_11 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+				}
+			d_10 *= dA[id+1];
+			d_11 *= dA[id+1];
+			d_00 -= pA[id+0+lda*(id+1)] * d_10;
+			d_01 -= pA[id+0+lda*(id+1)] * d_11;
+			d_00 *= dA[id+0];
+			d_01 *= dA[id+0];
+			pD[id+0+ldd*(jj+0)] = d_00;
+			pD[id+1+ldd*(jj+0)] = d_10;
+			pD[id+0+ldd*(jj+1)] = d_01;
+			pD[id+1+ldd*(jj+1)] = d_11;
+			}
+		for(; ii<m; ii++)
+			{
+			id = m-ii-1;
+			d_00 = alpha * pB[id+0+ldb*(jj+0)];
+			d_01 = alpha * pB[id+0+ldb*(jj+1)];
+			kk = id+1;
+			for(; kk<m; kk++)
+				{
+				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+				d_01 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+1)];
+				}
+			d_00 *= dA[id+0];
+			d_01 *= dA[id+0];
+			pD[id+0+ldd*(jj+0)] = d_00;
+			pD[id+0+ldd*(jj+1)] = d_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			id = m-ii-2;
+			d_00 = alpha * pB[id+0+ldb*(jj+0)];
+			d_10 = alpha * pB[id+1+ldb*(jj+0)];
+			kk = id+2;
+			for(; kk<m; kk++)
+				{
+				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+				d_10 -= pA[id+1+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+				}
+			d_10 *= dA[id+1];
+			d_00 -= pA[id+0+lda*(id+1)] * d_10;
+			d_00 *= dA[id+0];
+			pD[id+0+ldd*(jj+0)] = d_00;
+			pD[id+1+ldd*(jj+0)] = d_10;
+			}
+		for(; ii<m; ii++)
+			{
+			id = m-ii-1;
+			d_00 = alpha * pB[id+0+ldb*(jj+0)];
+			kk = id+1;
+			for(; kk<m; kk++)
+				{
+				d_00 -= pA[id+0+lda*(kk+0)] * pD[kk+0+ldd*(jj+0)];
+				}
+			d_00 *= dA[id+0];
+			pD[id+0+ldd*(jj+0)] = d_00;
+			}
+		}
+#else
+	// copy
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			for(ii=0; ii<m; ii++)
+				pD[ii+ldd*jj] = alpha * pB[ii+ldb*jj];
+		}
+	// solve
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=m-1; ii>=0; ii--)
+			{
+			d_00 = pD[ii+ldd*jj] * dA[ii];
+			pD[ii+ldd*jj] = d_00;
+			for(kk=0; kk<ii; kk++)
+				{
+				pD[kk+ldd*jj] -= pA[kk+lda*ii] * d_00;
+				}
+			}
+		}
+#endif
+	return;
+	}
+
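+// TRSM_LUNN above runs the backward substitution bottom-up: ii only counts the
+// rows processed so far, while id = m-ii-2 (m-ii-1 in the clean-up loop) is the
+// row pair actually being solved, so pD is filled from the last row upwards. The
+// diagonal of A is inverted into sA->dA at entry (unless a full-matrix copy is
+// already cached), turning each per-row division into a multiplication by dA[id].
+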
+
+
+// dtrsm_right_lower_transposed_unit
+void TRSM_RLTU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pD = sD->pA + di + dj*ldd;
+	REAL
+		f_10,
+		c_00, c_01,
+		c_10, c_11;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		f_10 = pA[jj+1+lda*(jj+0)];
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = alpha * pB[ii+0+ldb*(jj+0)];
+			c_10 = alpha * pB[ii+1+ldb*(jj+0)];
+			c_01 = alpha * pB[ii+0+ldb*(jj+1)];
+			c_11 = alpha * pB[ii+1+ldb*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
+				c_10 -= pD[ii+1+ldd*kk] * pA[jj+0+lda*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
+				c_11 -= pD[ii+1+ldd*kk] * pA[jj+1+lda*kk];
+				}
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			pD[ii+1+ldd*(jj+0)] = c_10;
+			c_01 -= c_00 * f_10;
+			c_11 -= c_10 * f_10;
+			pD[ii+0+ldd*(jj+1)] = c_01;
+			pD[ii+1+ldd*(jj+1)] = c_11;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = alpha * pB[ii+0+ldb*(jj+0)];
+			c_01 = alpha * pB[ii+0+ldb*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk]; // factor read from A, as in the 2x2 block above
+				c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
+				}
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			c_01 -= c_00 * f_10;
+			pD[ii+0+ldd*(jj+1)] = c_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		// factorize diagonal
+		for(ii=0; ii<m; ii++)
+			{
+			c_00 = alpha * pB[ii+ldb*jj];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+ldd*kk] * pA[jj+lda*kk];
+				}
+			pD[ii+ldd*jj] = c_00;
+			}
+		}
+	return;
+	}
+
+
+
+// dtrsm_right_lower_transposed_notunit
+void TRSM_RLTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pD = sD->pA + di + dj*ldd;
+	REAL *dA = sA->dA;
+	if(ai==0 & aj==0)
+		{
+		if(sA->use_dA!=1)
+			{
+			for(ii=0; ii<n; ii++)
+				dA[ii] = 1.0 / pA[ii+lda*ii];
+			sA->use_dA = 1;
+			}
+		}
+	else
+		{
+		for(ii=0; ii<n; ii++)
+			dA[ii] = 1.0 / pA[ii+lda*ii];
+		sA->use_dA = 0;
+		}
+	REAL
+		f_00_inv, 
+		f_10, f_11_inv,
+		c_00, c_01,
+		c_10, c_11;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		f_00_inv = dA[jj+0];
+		f_10 = pA[jj+1+lda*(jj+0)];
+		f_11_inv = dA[jj+1];
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = alpha * pB[ii+0+ldb*(jj+0)];
+			c_10 = alpha * pB[ii+1+ldb*(jj+0)];
+			c_01 = alpha * pB[ii+0+ldb*(jj+1)];
+			c_11 = alpha * pB[ii+1+ldb*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
+				c_10 -= pD[ii+1+ldd*kk] * pA[jj+0+lda*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
+				c_11 -= pD[ii+1+ldd*kk] * pA[jj+1+lda*kk];
+				}
+			c_00 *= f_00_inv;
+			c_10 *= f_00_inv;
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			pD[ii+1+ldd*(jj+0)] = c_10;
+			c_01 -= c_00 * f_10;
+			c_11 -= c_10 * f_10;
+			c_01 *= f_11_inv;
+			c_11 *= f_11_inv;
+			pD[ii+0+ldd*(jj+1)] = c_01;
+			pD[ii+1+ldd*(jj+1)] = c_11;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = alpha * pB[ii+0+ldb*(jj+0)];
+			c_01 = alpha * pB[ii+0+ldb*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pA[jj+0+lda*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pA[jj+1+lda*kk];
+				}
+			c_00 *= f_00_inv;
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			c_01 -= c_00 * f_10;
+			c_01 *= f_11_inv;
+			pD[ii+0+ldd*(jj+1)] = c_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		// factorize diagonal
+		f_00_inv = dA[jj];
+		for(ii=0; ii<m; ii++)
+			{
+			c_00 = alpha * pB[ii+ldb*jj];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+ldd*kk] * pA[jj+lda*kk];
+				}
+			c_00 *= f_00_inv;
+			pD[ii+ldd*jj] = c_00;
+			}
+		}
+	return;
+	}
+
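+// TRSM_RLTN_LIBSTR above caches the reciprocals of the diagonal of A in sA->dA
+// and marks them reusable (use_dA=1) only when ai==aj==0, i.e. when the cached
+// values describe the full matrix rather than a sub-matrix view. A hypothetical
+// double-precision caller solving twice against the same Cholesky factor sL
+// therefore pays for the 1.0/pA[ii+lda*ii] loop only once:
+//
+//	dtrsm_rltn_libstr(m, n, 1.0, &sL, 0, 0, &sB, 0, 0, &sD, 0, 0);   // fills sL.dA
+//	dtrsm_rltn_libstr(m, n, 1.0, &sL, 0, 0, &sB2, 0, 0, &sD2, 0, 0); // reuses sL.dA
+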
+
+
+// dtrsm_right_upper_transposed_notunit
+void TRSM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	int i1 = 1;
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+	printf("\ndtrsm_rutn_libstr: feature not implemented yet\n");
+	exit(1);
+//	if(!(pB==pD))
+//		{
+//		for(jj=0; jj<n; jj++)
+//			COPY(&m, pB+jj*sB->m, &i1, pD+jj*sD->m, &i1);
+//		}
+//	TRSM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &(sA->m), pD, &(sD->m));
+	return;
+	}
+
+
+
+// dtrmm_right_upper_transposed_notunit (A triangular !!!)
+void TRMM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	REAL 
+		c_00, c_01,
+		c_10, c_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pD = sD->pA + di + dj*ldd;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0;
+			c_10 = 0.0;
+			c_01 = 0.0;
+			c_11 = 0.0;
+			kk = jj;
+			c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+			c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
+			kk++;
+			for(; kk<n; kk++)
+				{
+				c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+				c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
+				c_01 += pB[(ii+0)+ldb*kk] * pA[(jj+1)+lda*kk];
+				c_11 += pB[(ii+1)+ldb*kk] * pA[(jj+1)+lda*kk];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
+			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
+			pD[(ii+1)+ldd*(jj+1)] = alpha * c_11;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			c_01 = 0.0;
+			kk = jj;
+			c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+			kk++;
+			for(; kk<n; kk++)
+				{
+				c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+				c_01 += pB[(ii+0)+ldb*kk] * pA[(jj+1)+lda*kk];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0;
+			c_10 = 0.0;
+			for(kk=jj; kk<n; kk++)
+				{
+				c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+				c_10 += pB[(ii+1)+ldb*kk] * pA[(jj+0)+lda*kk];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			for(kk=jj; kk<n; kk++)
+				{
+				c_00 += pB[(ii+0)+ldb*kk] * pA[(jj+0)+lda*kk];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+			}
+		}	
+	return;
+	}
+
+
+
+// dtrmm_right_lower_nottransposed_notunit (A triangular !!!)
+void TRMM_RLNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	REAL 
+		c_00, c_01,
+		c_10, c_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pD = sD->pA + di + dj*ldd;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0;
+			c_10 = 0.0;
+			c_01 = 0.0;
+			c_11 = 0.0;
+			kk = jj;
+			c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+			c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
+			kk++;
+			for(; kk<n; kk++)
+				{
+				c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+				c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
+				c_01 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+1)];
+				c_11 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+1)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
+			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
+			pD[(ii+1)+ldd*(jj+1)] = alpha * c_11;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			c_01 = 0.0;
+			kk = jj;
+			c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+			kk++;
+			for(; kk<n; kk++)
+				{
+				c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+				c_01 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+1)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+			pD[(ii+0)+ldd*(jj+1)] = alpha * c_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0;
+			c_10 = 0.0;
+			for(kk=jj; kk<n; kk++)
+				{
+				c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+				c_10 += pB[(ii+1)+ldb*kk] * pA[kk+lda*(jj+0)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+			pD[(ii+1)+ldd*(jj+0)] = alpha * c_10;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			for(kk=jj; kk<n; kk++)
+				{
+				c_00 += pB[(ii+0)+ldb*kk] * pA[kk+lda*(jj+0)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = alpha * c_00;
+			}
+		}
+	return;
+	}
+
+
+
+// dsyrk_lower_nottransposed (allowing for different factors => use dgemm !!!)
+void SYRK_LN_LIBSTR(int m, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0)
+		return;
+	int ii, jj, kk;
+	int n = m; // TODO optimize for this case !!!!!!!!!
+	REAL
+		c_00, c_01,
+		c_10, c_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pC = sC->pA + ci + cj*ldc;
+	REAL *pD = sD->pA + di + dj*ldd;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		// diagonal
+		c_00 = 0.0;
+		c_10 = 0.0;
+		c_11 = 0.0;
+		for(kk=0; kk<k; kk++)
+			{
+			c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
+			c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
+			c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
+			}
+		pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
+		pD[jj+1+ldd*(jj+0)] = beta * pC[jj+1+ldc*(jj+0)] + alpha * c_10;
+		pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
+		// lower
+		ii = jj+2;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0;
+			c_10 = 0.0;
+			c_01 = 0.0;
+			c_11 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+				c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
+				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+				c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
+				}
+			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
+			pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
+			pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
+			pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			c_01 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+				}
+			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
+			pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		// diagonal
+		c_00 = 0.0;
+		for(kk=0; kk<k; kk++)
+			{
+			c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
+			}
+		pD[jj+ldd*jj] = beta * pC[jj+ldc*jj] + alpha * c_00;
+		// lower
+		for(ii=jj+1; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
+				}
+			pD[ii+ldd*jj] = beta * pC[ii+ldc*jj] + alpha * c_00;
+			}
+		}
+	return;
+	}
+
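+// Only the lower triangle of D (diagonal included) is written by SYRK_LN above:
+// the 2x2 diagonal block skips the strictly-upper accumulator c_01, and the ii
+// loop over the sub-diagonal blocks starts at jj+2. The strictly upper parts of C
+// and D are never referenced, so with A different from B the routine simply
+// produces the lower triangle of alpha*A*B' + beta*C.
+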
+
+
+// dsyrk_lower_nottransposed (allowing for different factors => use dgemm !!!)
+void SYRK_LN_MN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	REAL
+		c_00, c_01,
+		c_10, c_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pC = sC->pA + ci + cj*ldc;
+	REAL *pD = sD->pA + di + dj*ldd;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		// diagonal
+		c_00 = 0.0;
+		c_10 = 0.0;
+		c_11 = 0.0;
+		for(kk=0; kk<k; kk++)
+			{
+			c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
+			c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
+			c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
+			}
+		pD[jj+0+ldd*(jj+0)] = beta * pC[jj+0+ldc*(jj+0)] + alpha * c_00;
+		pD[jj+1+ldd*(jj+0)] = beta * pC[jj+1+ldc*(jj+0)] + alpha * c_10;
+		pD[jj+1+ldd*(jj+1)] = beta * pC[jj+1+ldc*(jj+1)] + alpha * c_11;
+		// lower
+		ii = jj+2;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = 0.0;
+			c_10 = 0.0;
+			c_01 = 0.0;
+			c_11 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+				c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
+				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+				c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
+				}
+			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
+			pD[ii+1+ldd*(jj+0)] = beta * pC[ii+1+ldc*(jj+0)] + alpha * c_10;
+			pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
+			pD[ii+1+ldd*(jj+1)] = beta * pC[ii+1+ldc*(jj+1)] + alpha * c_11;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			c_01 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+				}
+			pD[ii+0+ldd*(jj+0)] = beta * pC[ii+0+ldc*(jj+0)] + alpha * c_00;
+			pD[ii+0+ldd*(jj+1)] = beta * pC[ii+0+ldc*(jj+1)] + alpha * c_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		// diagonal
+		c_00 = 0.0;
+		for(kk=0; kk<k; kk++)
+			{
+			c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
+			}
+		pD[jj+ldd*jj] = beta * pC[jj+ldc*jj] + alpha * c_00;
+		// lower
+		for(ii=jj+1; ii<m; ii++)
+			{
+			c_00 = 0.0;
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
+				}
+			pD[ii+ldd*jj] = beta * pC[ii+ldc*jj] + alpha * c_00;
+			}
+		}
+	return;
+	}
+
+
+
+#elif defined(LA_BLAS)
+
+
+
+// dgemm nt
+void GEMM_NT_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cn = 'n';
+	char ct = 't';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pC = sC->pA+ci+cj*sC->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long kk = k;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(beta==0.0 || pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	GEMM(&cn, &ct, &mm, &nn, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(beta==0.0 || pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	GEMM(&cn, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+#endif
+	return;
+	}
+
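+// Every LA_BLAS wrapper below follows the same pattern as GEMM_NT_LIBSTR above:
+// BLAS overwrites one of its operands in place (C for gemm/syrk, B for trsm/trmm)
+// while the strmat interface writes to a separate D, so that operand is first
+// copied column by column into D (skipped when beta==0.0 for gemm/syrk, or when
+// source and D alias) and the BLAS routine is then called with D in its place.
+// A condensed sketch of the same idea, using the Fortran BLAS symbol dgemm_
+// purely for illustration:
+//
+//	if(beta!=0.0 && pC!=pD)
+//		for(jj=0; jj<n; jj++)
+//			memcpy(pD+jj*ldd, pC+jj*ldc, m*sizeof(double));
+//	dgemm_(&cn, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+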
+
+
+// dgemm nn
+void GEMM_NN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cn = 'n';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pC = sC->pA+ci+cj*sC->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long kk = k;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(beta==0.0 || pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	GEMM(&cn, &cn, &mm, &nn, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(beta==0.0 || pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	GEMM(&cn, &cn, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dtrsm_left_lower_nottransposed_unit
+void TRSM_LLNU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cu = 'u';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cl, &cl, &cn, &cu, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pB+jj*ldb, &i1, pD+jj*sD->m, &i1);
+		}
+	TRSM(&cl, &cl, &cn, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dtrsm_left_upper_nottransposed_notunit
+void TRSM_LUNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cu = 'u';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cl, &cu, &cn, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cl, &cu, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dtrsm_right_lower_transposed_unit
+void TRSM_RLTU_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cr, &cl, &ct, &cu, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cr, &cl, &ct, &cu, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dtrsm_right_lower_transposed_notunit
+void TRSM_RLTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cr, &cl, &ct, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cr, &cl, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dtrsm_right_upper_transposed_notunit
+void TRSM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cr, &cu, &ct, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRSM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dtrmm_right_upper_transposed_notunit (A triangular !!!)
+void TRMM_RUTN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRMM(&cr, &cu, &ct, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRMM(&cr, &cu, &ct, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dtrmm_right_lower_nottransposed_notunit (A triangular !!!)
+void TRMM_RLNN_LIBSTR(int m, int n, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	REAL *pA = sA->pA+ai+aj*sA->m;
+	REAL *pB = sB->pA+bi+bj*sB->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRMM(&cr, &cl, &cn, &cn, &mm, &nn, &alpha, pA, &lda, pD, &ldd);
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldd = sD->m;
+	if(!(pB==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pB+jj*ldb, &i1, pD+jj*ldd, &i1);
+		}
+	TRMM(&cr, &cl, &cn, &cn, &m, &n, &alpha, pA, &lda, pD, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dsyrk_lower_nottransposed (allowing for different factors => use dgemm !!!)
+void SYRK_LN_LIBSTR(int m, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	REAL *pA = sA->pA + ai + aj*sA->m;
+	REAL *pB = sB->pA + bi + bj*sB->m;
+	REAL *pC = sC->pA + ci + cj*sC->m;
+	REAL *pD = sD->pA + di + dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long kk = k;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(beta==0.0 || pC==pD))
+		{
+		for(jj=0; jj<m; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	if(pA==pB)
+		{
+		SYRK(&cl, &cn, &mm, &kk, &alpha, pA, &lda, &beta, pD, &ldd);
+		}
+	else
+		{
+		GEMM(&cn, &ct, &mm, &mm, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+		}
+#else
+	int i1 = 1;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(beta==0.0 || pC==pD))
+		{
+		for(jj=0; jj<m; jj++)
+			COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
+		}
+	if(pA==pB)
+		{
+		SYRK(&cl, &cn, &m, &k, &alpha, pA, &lda, &beta, pD, &ldd);
+		}
+	else
+		{
+		GEMM(&cn, &ct, &m, &m, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+		}
+#endif
+	return;
+	}
+
+// dsyrk_lower_nottransposed (allowing for different factors => use dgemm !!!)
+void SYRK_LN_MN_LIBSTR(int m, int n, int k, REAL alpha, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, REAL beta, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	REAL *pA = sA->pA + ai + aj*sA->m;
+	REAL *pB = sB->pA + bi + bj*sB->m;
+	REAL *pC = sC->pA + ci + cj*sC->m;
+	REAL *pD = sD->pA + di + dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long kk = k;
+	long long mmn = mm-nn;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(beta==0.0 || pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	if(pA==pB)
+		{
+		SYRK(&cl, &cn, &nn, &kk, &alpha, pA, &lda, &beta, pD, &ldd);
+		GEMM(&cn, &ct, &mmn, &nn, &kk, &alpha, pA+n, &lda, pB, &ldb, &beta, pD+n, &ldd);
+		}
+	else
+		{
+		GEMM(&cn, &ct, &mm, &nn, &kk, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+		}
+#else
+	int i1 = 1;
+	int mmn = m-n;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(beta==0.0 || pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pC+jj*sC->m, &i1, pD+jj*sD->m, &i1);
+		}
+	if(pA==pB)
+		{
+		SYRK(&cl, &cn, &n, &k, &alpha, pA, &lda, &beta, pD, &ldd);
+		GEMM(&cn, &ct, &mmn, &n, &k, &alpha, pA+n, &lda, pB, &ldb, &beta, pD+n, &ldd);
+		}
+	else
+		{
+		GEMM(&cn, &ct, &m, &n, &k, &alpha, pA, &lda, pB, &ldb, &beta, pD, &ldd);
+		}
+#endif
+	return;
+	}
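+// When A and B alias, the m x n lower result above is assembled from SYRK on the
+// leading n x n block (where the result really is symmetric) plus, when m > n,
+// GEMM on the remaining (m-n) x n rows, addressed through the row offsets pA+n
+// and pD+n. When A and B differ, a single GEMM over the full m x n block is used
+// instead.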
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
+
diff --git a/blas/x_lapack_lib.c b/blas/x_lapack_lib.c
new file mode 100644
index 0000000..762a8a0
--- /dev/null
+++ b/blas/x_lapack_lib.c
@@ -0,0 +1,2112 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(LA_REFERENCE)
+
+
+
+// dpotrf
+void POTRF_L_LIBSTR(int m, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0)
+		return;
+	int ii, jj, kk;
+	REAL
+		f_00_inv, 
+		f_10, f_11_inv,
+		c_00, c_01,
+		c_10, c_11;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pC = sC->pA + ci + cj*ldc;
+	REAL *pD = sD->pA + di + dj*ldd;
+	REAL *dD = sD->dA;
+	if(di==0 & dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+	jj = 0;
+	for(; jj<m-1; jj+=2)
+		{
+		// factorize diagonal
+		c_00 = pC[jj+0+ldc*(jj+0)];
+		c_10 = pC[jj+1+ldc*(jj+0)];
+		c_11 = pC[jj+1+ldc*(jj+1)];
+		for(kk=0; kk<jj; kk++)
+			{
+			c_00 -= pD[jj+0+ldd*kk] * pD[jj+0+ldd*kk];
+			c_10 -= pD[jj+1+ldd*kk] * pD[jj+0+ldd*kk];
+			c_11 -= pD[jj+1+ldd*kk] * pD[jj+1+ldd*kk];
+			}
+		if(c_00>0)
+			{
+			f_00_inv = 1.0/sqrt(c_00);
+			}
+		else
+			{
+			f_00_inv = 0.0;
+			}
+		dD[jj+0] = f_00_inv;
+		pD[jj+0+ldd*(jj+0)] = c_00 * f_00_inv;
+		f_10 = c_10 * f_00_inv;
+		pD[jj+1+ldd*(jj+0)] = f_10;
+		c_11 -= f_10 * f_10;
+		if(c_11>0)
+			{
+			f_11_inv = 1.0/sqrt(c_11);
+			}
+		else
+			{
+			f_11_inv = 0.0;
+			}
+		dD[jj+1] = f_11_inv;
+		pD[jj+1+ldd*(jj+1)] = c_11 * f_11_inv;
+		// solve lower
+		ii = jj+2;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = pC[ii+0+ldc*(jj+0)];
+			c_10 = pC[ii+1+ldc*(jj+0)];
+			c_01 = pC[ii+0+ldc*(jj+1)];
+			c_11 = pC[ii+1+ldc*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+				c_10 -= pD[ii+1+ldd*kk] * pD[jj+0+ldd*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+				c_11 -= pD[ii+1+ldd*kk] * pD[jj+1+ldd*kk];
+				}
+			c_00 *= f_00_inv;
+			c_10 *= f_00_inv;
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			pD[ii+1+ldd*(jj+0)] = c_10;
+			c_01 -= c_00 * f_10;
+			c_11 -= c_10 * f_10;
+			pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+			pD[ii+1+ldd*(jj+1)] = c_11 * f_11_inv;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = pC[ii+0+ldc*(jj+0)];
+			c_01 = pC[ii+0+ldc*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+				}
+			c_00 *= f_00_inv;
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			c_01 -= c_00 * f_10;
+			pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+			}
+		}
+	for(; jj<m; jj++)
+		{
+		// factorize diagonal
+		c_00 = pC[jj+ldc*jj];
+		for(kk=0; kk<jj; kk++)
+			{
+			c_00 -= pD[jj+ldd*kk] * pD[jj+ldd*kk];
+			}
+		if(c_00>0)
+			{
+			f_00_inv = 1.0/sqrt(c_00);
+			}
+		else
+			{
+			f_00_inv = 0.0;
+			}
+		dD[jj] = f_00_inv;
+		pD[jj+ldd*jj] = c_00 * f_00_inv;
+		// solve lower
+//		for(ii=jj+1; ii<m; ii++)
+//			{
+//			c_00 = pC[ii+ldc*jj];
+//			for(kk=0; kk<jj; kk++)
+//				{
+//				c_00 -= pD[ii+ldd*kk] * pD[jj+ldd*kk];
+//				}
+//			pD[ii+ldd*jj] = c_00 * f_00_inv;
+//			}
+		}
+	return;
+	}
+
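+// POTRF_L_LIBSTR above never signals failure: whenever a pivot c_00 (or c_11) is
+// not strictly positive, its inverse is set to 0.0, which zeroes that column of
+// the factor from the diagonal down instead of producing NaNs. A caller that
+// needs to detect loss of positive definiteness has to inspect the stored
+// reciprocal pivots itself, e.g. (double precision, with di==dj==0 so that sD.dA
+// describes the whole factor):
+//
+//	dpotrf_l_libstr(m, &sC, 0, 0, &sD, 0, 0);
+//	for(ii=0; ii<m; ii++)
+//		if(sD.dA[ii]==0.0)
+//			{ /* pivot ii was <= 0 */ }
+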
+
+
+// dpotrf, m x n variant (factorize the leading n x n block, solve the rows below against it)
+void POTRF_L_MN_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	REAL
+		f_00_inv, 
+		f_10, f_11_inv,
+		c_00, c_01,
+		c_10, c_11;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pC = sC->pA + ci + cj*ldc;
+	REAL *pD = sD->pA + di + dj*ldd;
+	REAL *dD = sD->dA;
+	if(di==0 & dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		// factorize diagonal
+		c_00 = pC[jj+0+ldc*(jj+0)];
+		c_10 = pC[jj+1+ldc*(jj+0)];
+		c_11 = pC[jj+1+ldc*(jj+1)];
+		for(kk=0; kk<jj; kk++)
+			{
+			c_00 -= pD[jj+0+ldd*kk] * pD[jj+0+ldd*kk];
+			c_10 -= pD[jj+1+ldd*kk] * pD[jj+0+ldd*kk];
+			c_11 -= pD[jj+1+ldd*kk] * pD[jj+1+ldd*kk];
+			}
+		if(c_00>0)
+			{
+			f_00_inv = 1.0/sqrt(c_00);
+			}
+		else
+			{
+			f_00_inv = 0.0;
+			}
+		dD[jj+0] = f_00_inv;
+		pD[jj+0+ldd*(jj+0)] = c_00 * f_00_inv;
+		f_10 = c_10 * f_00_inv;
+		pD[jj+1+ldd*(jj+0)] = f_10;
+		c_11 -= f_10 * f_10;
+		if(c_11>0)
+			{
+			f_11_inv = 1.0/sqrt(c_11);
+			}
+		else
+			{
+			f_11_inv = 0.0;
+			}
+		dD[jj+1] = f_11_inv;
+		pD[jj+1+ldd*(jj+1)] = c_11 * f_11_inv;
+		// solve lower
+		ii = jj+2;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = pC[ii+0+ldc*(jj+0)];
+			c_10 = pC[ii+1+ldc*(jj+0)];
+			c_01 = pC[ii+0+ldc*(jj+1)];
+			c_11 = pC[ii+1+ldc*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+				c_10 -= pD[ii+1+ldd*kk] * pD[jj+0+ldd*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+				c_11 -= pD[ii+1+ldd*kk] * pD[jj+1+ldd*kk];
+				}
+			c_00 *= f_00_inv;
+			c_10 *= f_00_inv;
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			pD[ii+1+ldd*(jj+0)] = c_10;
+			c_01 -= c_00 * f_10;
+			c_11 -= c_10 * f_10;
+			pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+			pD[ii+1+ldd*(jj+1)] = c_11 * f_11_inv;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = pC[ii+0+ldc*(jj+0)];
+			c_01 = pC[ii+0+ldc*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+				}
+			c_00 *= f_00_inv;
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			c_01 -= c_00 * f_10;
+			pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		// factorize diagonal
+		c_00 = pC[jj+ldc*jj];
+		for(kk=0; kk<jj; kk++)
+			{
+			c_00 -= pD[jj+ldd*kk] * pD[jj+ldd*kk];
+			}
+		if(c_00>0)
+			{
+			f_00_inv = 1.0/sqrt(c_00);
+			}
+		else
+			{
+			f_00_inv = 0.0;
+			}
+		dD[jj] = f_00_inv;
+		pD[jj+ldd*jj] = c_00 * f_00_inv;
+		// solve lower
+		for(ii=jj+1; ii<m; ii++)
+			{
+			c_00 = pC[ii+ldc*jj];
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+ldd*kk] * pD[jj+ldd*kk];
+				}
+			pD[ii+ldd*jj] = c_00 * f_00_inv;
+			}
+		}
+	return;
+	}
+
+
+
+// fused dsyrk + dpotrf (lower)
+void SYRK_POTRF_LN_LIBSTR(int m, int n, int k, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	int ii, jj, kk;
+	REAL
+		f_00_inv, 
+		f_10, f_11_inv,
+		c_00, c_01,
+		c_10, c_11;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA + ai + aj*lda;
+	REAL *pB = sB->pA + bi + bj*ldb;
+	REAL *pC = sC->pA + ci + cj*ldc;
+	REAL *pD = sD->pA + di + dj*ldd;
+	REAL *dD = sD->dA;
+	if(di==0 & dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		// factorize diagonal
+		c_00 = pC[jj+0+ldc*(jj+0)];
+		c_10 = pC[jj+1+ldc*(jj+0)];
+		c_11 = pC[jj+1+ldc*(jj+1)];
+		for(kk=0; kk<k; kk++)
+			{
+			c_00 += pA[jj+0+lda*kk] * pB[jj+0+ldb*kk];
+			c_10 += pA[jj+1+lda*kk] * pB[jj+0+ldb*kk];
+			c_11 += pA[jj+1+lda*kk] * pB[jj+1+ldb*kk];
+			}
+		for(kk=0; kk<jj; kk++)
+			{
+			c_00 -= pD[jj+0+ldd*kk] * pD[jj+0+ldd*kk];
+			c_10 -= pD[jj+1+ldd*kk] * pD[jj+0+ldd*kk];
+			c_11 -= pD[jj+1+ldd*kk] * pD[jj+1+ldd*kk];
+			}
+		if(c_00>0)
+			{
+			f_00_inv = 1.0/sqrt(c_00);
+			}
+		else
+			{
+			f_00_inv = 0.0;
+			}
+		dD[jj+0] = f_00_inv;
+		pD[jj+0+ldd*(jj+0)] = c_00 * f_00_inv;
+		f_10 = c_10 * f_00_inv;
+		pD[jj+1+ldd*(jj+0)] = f_10;
+		c_11 -= f_10 * f_10;
+		if(c_11>0)
+			{
+			f_11_inv = 1.0/sqrt(c_11);
+			}
+		else
+			{
+			f_11_inv = 0.0;
+			}
+		dD[jj+1] = f_11_inv;
+		pD[jj+1+ldd*(jj+1)] = c_11 * f_11_inv;
+		// solve lower
+		ii = jj+2;
+		for(; ii<m-1; ii+=2)
+			{
+			c_00 = pC[ii+0+ldc*(jj+0)];
+			c_10 = pC[ii+1+ldc*(jj+0)];
+			c_01 = pC[ii+0+ldc*(jj+1)];
+			c_11 = pC[ii+1+ldc*(jj+1)];
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+				c_10 += pA[ii+1+lda*kk] * pB[jj+0+ldb*kk];
+				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+				c_11 += pA[ii+1+lda*kk] * pB[jj+1+ldb*kk];
+				}
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+				c_10 -= pD[ii+1+ldd*kk] * pD[jj+0+ldd*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+				c_11 -= pD[ii+1+ldd*kk] * pD[jj+1+ldd*kk];
+				}
+			c_00 *= f_00_inv;
+			c_10 *= f_00_inv;
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			pD[ii+1+ldd*(jj+0)] = c_10;
+			c_01 -= c_00 * f_10;
+			c_11 -= c_10 * f_10;
+			pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+			pD[ii+1+ldd*(jj+1)] = c_11 * f_11_inv;
+			}
+		for(; ii<m; ii++)
+			{
+			c_00 = pC[ii+0+ldc*(jj+0)];
+			c_01 = pC[ii+0+ldc*(jj+1)];
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+0+lda*kk] * pB[jj+0+ldb*kk];
+				c_01 += pA[ii+0+lda*kk] * pB[jj+1+ldb*kk];
+				}
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+0+ldd*kk] * pD[jj+0+ldd*kk];
+				c_01 -= pD[ii+0+ldd*kk] * pD[jj+1+ldd*kk];
+				}
+			c_00 *= f_00_inv;
+			pD[ii+0+ldd*(jj+0)] = c_00;
+			c_01 -= c_00 * f_10;
+			pD[ii+0+ldd*(jj+1)] = c_01 * f_11_inv;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		// factorize diagonal
+		c_00 = pC[jj+ldc*jj];
+		for(kk=0; kk<k; kk++)
+			{
+			c_00 += pA[jj+lda*kk] * pB[jj+ldb*kk];
+			}
+		for(kk=0; kk<jj; kk++)
+			{
+			c_00 -= pD[jj+ldd*kk] * pD[jj+ldd*kk];
+			}
+		if(c_00>0)
+			{
+			f_00_inv = 1.0/sqrt(c_00);
+			}
+		else
+			{
+			f_00_inv = 0.0;
+			}
+		dD[jj] = f_00_inv;
+		pD[jj+ldd*jj] = c_00 * f_00_inv;
+		// solve lower
+		for(ii=jj+1; ii<m; ii++)
+			{
+			c_00 = pC[ii+ldc*jj];
+			for(kk=0; kk<k; kk++)
+				{
+				c_00 += pA[ii+lda*kk] * pB[jj+ldb*kk];
+				}
+			for(kk=0; kk<jj; kk++)
+				{
+				c_00 -= pD[ii+ldd*kk] * pD[jj+ldd*kk];
+				}
+			pD[ii+ldd*jj] = c_00 * f_00_inv;
+			}
+		}
+	return;
+	}
+
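+// SYRK_POTRF_LN_LIBSTR above fuses the two operations: each block of C is first
+// updated with its A*B' contribution (the syrk part, loop over k) and then
+// immediately corrected against the already-factorized columns and factorized or
+// solved (the potrf part, loop over kk<jj), so C is read and D written exactly
+// once. Assuming the double-precision names, it matches the unfused sequence
+//
+//	dsyrk_ln_mn_libstr(m, n, k, 1.0, &sA, 0, 0, &sB, 0, 0, 1.0, &sC, 0, 0, &sD, 0, 0);
+//	dpotrf_l_mn_libstr(m, n, &sD, 0, 0, &sD, 0, 0);
+//
+// without storing the intermediate syrk result.
+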
+
+
+// dgetf2 without pivoting
+void GETF2_NOPIVOT(int m, int n, REAL *A, int lda, REAL *dA)
+	{
+	int ii, jj, kk, itmp0, itmp1;
+	int iimax = m<n ? m : n;
+	int i1 = 1;
+	REAL dtmp;
+	REAL dm1 = -1.0;
+
+	for(ii=0; ii<iimax; ii++)
+		{
+		itmp0 = m-ii-1;
+		dtmp = 1.0/A[ii+lda*ii];
+		dA[ii] = dtmp;
+		for(jj=0; jj<itmp0; jj++)
+			{
+			A[ii+1+jj+lda*ii] *= dtmp;
+			}
+		itmp1 = n-ii-1;
+		for(jj=0; jj<itmp1; jj++)
+			{
+			for(kk=0; kk<itmp0; kk++)
+				{
+				A[(ii+1+kk)+lda*(ii+1+jj)] -= A[(ii+1+kk)+lda*ii] * A[ii+lda*(ii+1+jj)];
+				}
+			}
+		}
+	return;
+	}
+
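+// GETF2_NOPIVOT above is the unblocked, right-looking LU kernel on plain
+// column-major arrays: at step ii it stores 1.0/A(ii,ii) into dA[ii], scales the
+// column below the pivot (forming L), and applies the rank-1 update
+// A(ii+1:m, ii+1:n) -= L(ii+1:m, ii) * U(ii, ii+1:n) to the trailing sub-matrix.
+// There is no pivoting, so a zero or tiny A(ii,ii) is the caller's responsibility.
+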
+
+
+// dgetrf without pivoting
+void GETRF_NOPIVOT_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	int ii, jj, kk;
+//	int i1 = 1;
+//	REAL d1 = 1.0;
+	REAL
+		d_00_inv, d_11_inv,
+		d_00, d_01,
+		d_10, d_11;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pC = sC->pA + ci + cj*ldc;
+	REAL *pD = sD->pA + di + dj*ldd;
+	REAL *dD = sD->dA;
+	if(di==0 & dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+#if 1
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		// upper
+		ii = 0;
+		for(; ii<jj-1; ii+=2)
+			{
+			// correct upper
+			d_00 = pC[(ii+0)+ldc*(jj+0)];
+			d_10 = pC[(ii+1)+ldc*(jj+0)];
+			d_01 = pC[(ii+0)+ldc*(jj+1)];
+			d_11 = pC[(ii+1)+ldc*(jj+1)];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			// solve upper
+			d_10 -= pD[(ii+1)+ldd*kk] * d_00;
+			d_11 -= pD[(ii+1)+ldd*kk] * d_01;
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+1)+ldd*(jj+0)] = d_10;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			pD[(ii+1)+ldd*(jj+1)] = d_11;
+			}
+		for(; ii<jj; ii++)
+			{
+			// correct upper
+			d_00 = pC[(ii+0)+ldc*(jj+0)];
+			d_01 = pC[(ii+0)+ldc*(jj+1)];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			// solve upper
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			}
+		// diagonal
+		ii = jj;
+		if(ii<m-1)
+			{
+			// correct diagonal
+			d_00 = pC[(ii+0)+ldc*(jj+0)];
+			d_10 = pC[(ii+1)+ldc*(jj+0)];
+			d_01 = pC[(ii+0)+ldc*(jj+1)];
+			d_11 = pC[(ii+1)+ldc*(jj+1)];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			// factorize diagonal
+			d_00_inv = 1.0/d_00;
+			d_10 *= d_00_inv;
+			d_11 -= d_10 * d_01;
+			d_11_inv = 1.0/d_11;
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+1)+ldd*(jj+0)] = d_10;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			pD[(ii+1)+ldd*(jj+1)] = d_11;
+			dD[ii+0] = d_00_inv;
+			dD[ii+1] = d_11_inv;
+			ii += 2;
+			}
+		else if(ii<m)
+			{
+			// correct diagonal
+			d_00 = pC[(ii+0)+ldc*(jj+0)];
+			d_01 = pC[(ii+0)+ldc*(jj+1)];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			// factorize diagonal
+			d_00_inv = 1.0/d_00;
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			dD[ii+0] = d_00_inv;
+			ii += 1;
+			}
+		// lower
+		for(; ii<m-1; ii+=2)
+			{
+			// correct lower
+			d_00 = pC[(ii+0)+ldc*(jj+0)];
+			d_10 = pC[(ii+1)+ldc*(jj+0)];
+			d_01 = pC[(ii+0)+ldc*(jj+1)];
+			d_11 = pC[(ii+1)+ldc*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			// solve lower
+			d_00 *= d_00_inv;
+			d_10 *= d_00_inv;
+			d_01 -= d_00 * pD[kk+ldd*(jj+1)];
+			d_11 -= d_10 * pD[kk+ldd*(jj+1)];
+			d_01 *= d_11_inv;
+			d_11 *= d_11_inv;
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+1)+ldd*(jj+0)] = d_10;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			pD[(ii+1)+ldd*(jj+1)] = d_11;
+			}
+		for(; ii<m; ii++)
+			{
+			// correct lower
+			d_00 = pC[(ii+0)+ldc*(jj+0)];
+			d_01 = pC[(ii+0)+ldc*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			// solve lower
+			d_00 *= d_00_inv;
+			d_01 -= d_00 * pD[kk+ldd*(jj+1)];
+			d_01 *= d_11_inv;
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		// upper
+		ii = 0;
+		for(; ii<jj-1; ii+=2)
+			{
+			// correct upper
+			d_00 = pC[(ii+0)+ldc*jj];
+			d_10 = pC[(ii+1)+ldc*jj];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			// solve upper
+			d_10 -= pD[(ii+1)+ldd*kk] * d_00;
+			pD[(ii+0)+ldd*jj] = d_00;
+			pD[(ii+1)+ldd*jj] = d_10;
+			}
+		for(; ii<jj; ii++)
+			{
+			// correct upper
+			d_00 = pC[(ii+0)+ldc*jj];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			// solve upper
+			pD[(ii+0)+ldd*jj] = d_00;
+			}
+		// diagonal
+		ii = jj;
+		if(ii<m-1)
+			{
+			// correct diagonal
+			d_00 = pC[(ii+0)+ldc*jj];
+			d_10 = pC[(ii+1)+ldc*jj];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			// factorize diagonal
+			d_00_inv = 1.0/d_00;
+			d_10 *= d_00_inv;
+			pD[(ii+0)+ldd*jj] = d_00;
+			pD[(ii+1)+ldd*jj] = d_10;
+			dD[ii+0] = d_00_inv;
+			ii += 2;
+			}
+		else if(ii<m)
+			{
+			// correct diagonal
+			d_00 = pC[(ii+0)+ldc*jj];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			// factorize diagonal
+			d_00_inv = 1.0/d_00;
+			pD[(ii+0)+ldd*jj] = d_00;
+			dD[ii+0] = d_00_inv;
+			ii += 1;
+			}
+		// lower
+		for(; ii<m-1; ii+=2)
+			{
+			// correct lower
+			d_00 = pC[(ii+0)+ldc*jj];
+			d_10 = pC[(ii+1)+ldc*jj];
+			for(kk=0; kk<jj; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			// solve lower
+			d_00 *= d_00_inv;
+			d_10 *= d_00_inv;
+			pD[(ii+0)+ldd*jj] = d_00;
+			pD[(ii+1)+ldd*jj] = d_10;
+			}
+		for(; ii<m; ii++)
+			{
+			// correct lower
+			d_00 = pC[(ii+0)+ldc*jj];
+			for(kk=0; kk<jj; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			// solve lower
+			d_00 *= d_00_inv;
+			pD[(ii+0)+ldd*jj] = d_00;
+			}
+		}
+#else
+	if(pC!=pD)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			for(ii=0; ii<m; ii++)
+				{
+				pD[ii+ldd*jj] = pC[ii+ldc*jj];
+				}
+			}
+		}
+	GETF2_NOPIVOT(m, n, pD, ldd, dD);
+#endif
+	return;
+	}
+
+
+
+// dgetrf pivoting
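+// (LU factorization with partial row pivoting; the selected row indices are returned 0-based in ipiv, the reciprocals of the pivots in sD->dA)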
+void GETRF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, int *ipiv)
+	{
+	int ii, i0, jj, kk, ip, itmp0, itmp1;
+	REAL dtmp, dmax;
+	REAL
+		d_00_inv, d_11_inv,
+		d_00, d_01,
+		d_10, d_11;
+	int i1 = 1;
+	REAL d1 = 1.0;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	REAL *pC = sC->pA+ci+cj*ldc;
+	REAL *pD = sD->pA+di+dj*ldd;
+	REAL *dD = sD->dA;
+	if(di==0 & dj==0)
+		sD->use_dA = 1;
+	else
+		sD->use_dA = 0;
+	// copy if needed
+	if(pC!=pD)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			for(ii=0; ii<m; ii++)
+				{
+				pD[ii+ldd*jj] = pC[ii+ldc*jj];
+				}
+			}
+		}
+	// factorize
+#if 1
+	jj = 0;
+	for(; jj<n-1; jj+=2)
+		{
+		ii = 0;
+		for(; ii<jj-1; ii+=2)
+			{
+			// correct upper
+			d_00 = pD[(ii+0)+ldd*(jj+0)];
+			d_10 = pD[(ii+1)+ldd*(jj+0)];
+			d_01 = pD[(ii+0)+ldd*(jj+1)];
+			d_11 = pD[(ii+1)+ldd*(jj+1)];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			// solve upper
+			d_10 -= pD[(ii+1)+ldd*kk] * d_00;
+			d_11 -= pD[(ii+1)+ldd*kk] * d_01;
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+1)+ldd*(jj+0)] = d_10;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			pD[(ii+1)+ldd*(jj+1)] = d_11;
+			}
+		for(; ii<jj; ii++)
+			{
+			// correct upper
+			d_00 = pD[(ii+0)+ldd*(jj+0)];
+			d_01 = pD[(ii+0)+ldd*(jj+1)];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			// solve upper
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			}
+		// correct diagonal and lower and look for pivot
+		// correct
+		ii = jj;
+		i0 = ii;
+		for(; ii<m-1; ii+=2)
+			{
+			d_00 = pD[(ii+0)+ldd*(jj+0)];
+			d_10 = pD[(ii+1)+ldd*(jj+0)];
+			d_01 = pD[(ii+0)+ldd*(jj+1)];
+			d_11 = pD[(ii+1)+ldd*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				d_11 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+1)+ldd*(jj+0)] = d_10;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			pD[(ii+1)+ldd*(jj+1)] = d_11;
+			}
+		for(; ii<m; ii++)
+			{
+			d_00 = pD[(ii+0)+ldd*(jj+0)];
+			d_01 = pD[(ii+0)+ldd*(jj+1)];
+			for(kk=0; kk<jj; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+0)];
+				d_01 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*(jj+1)];
+				}
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			}
+		// look for pivot & solve
+		// left column
+		ii = i0;
+		dmax = 0;
+		ip = ii;
+		for(; ii<m-1; ii+=2)
+			{
+			d_00 = pD[(ii+0)+ldd*jj];
+			d_10 = pD[(ii+1)+ldd*jj];
+			dtmp = d_00>0.0 ? d_00 : -d_00;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+0;
+				}
+			dtmp = d_10>0.0 ? d_10 : -d_10;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+1;
+				}
+			}
+		for(; ii<m; ii++)
+			{
+			d_00 = pD[(ii+0)+ldd*jj];
+			dtmp = d_00>0.0 ? d_00 : -d_00;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+0;
+				}
+			}
+		// row swap
+		ii = i0;
+		ipiv[ii] = ip;
+		if(ip!=ii)
+			{
+			for(kk=0; kk<n; kk++)
+				{
+				dtmp = pD[ii+ldd*kk];
+				pD[ii+ldd*kk] = pD[ip+ldd*kk];
+				pD[ip+ldd*kk] = dtmp;
+				}
+			}
+		// factorize diagonal
+		d_00 = pD[(ii+0)+ldd*(jj+0)];
+		d_00_inv = 1.0/d_00;
+		pD[(ii+0)+ldd*(jj+0)] = d_00;
+		dD[ii] = d_00_inv;
+		ii += 1;
+		// solve & compute next pivot
+		dmax = 0;
+		ip = ii;
+		for(; ii<m-1; ii+=2)
+			{
+			d_00 = pD[(ii+0)+ldd*(jj+0)];
+			d_10 = pD[(ii+1)+ldd*(jj+0)];
+			d_00 *= d_00_inv;
+			d_10 *= d_00_inv;
+			d_01 = pD[(ii+0)+ldd*(jj+1)];
+			d_11 = pD[(ii+1)+ldd*(jj+1)];
+			d_01 -= d_00 * pD[jj+ldd*(jj+1)];
+			d_11 -= d_10 * pD[jj+ldd*(jj+1)];
+			dtmp = d_01>0.0 ? d_01 : -d_01;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+0;
+				}
+			dtmp = d_11>0.0 ? d_11 : -d_11;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+1;
+				}
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+1)+ldd*(jj+0)] = d_10;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			pD[(ii+1)+ldd*(jj+1)] = d_11;
+			}
+		for(; ii<m; ii++)
+			{
+			d_00 = pD[(ii+0)+ldd*(jj+0)];
+			d_00 *= d_00_inv;
+			d_01 = pD[(ii+0)+ldd*(jj+1)];
+			d_01 -= d_00 * pD[jj+ldd*(jj+1)];
+			dtmp = d_01>0.0 ? d_01 : -d_01;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+0;
+				}
+			pD[(ii+0)+ldd*(jj+0)] = d_00;
+			pD[(ii+0)+ldd*(jj+1)] = d_01;
+			}
+		// row swap
+		ii = i0+1;
+		ipiv[ii] = ip;
+		if(ip!=ii)
+			{
+			for(kk=0; kk<n; kk++)
+				{
+				dtmp = pD[ii+ldd*kk];
+				pD[ii+ldd*kk] = pD[ip+ldd*kk];
+				pD[ip+ldd*kk] = dtmp;
+				}
+			}
+		// factorize diagonal
+		d_00 = pD[ii+ldd*(jj+1)];
+		d_00_inv = 1.0/d_00;
+		pD[ii+ldd*(jj+1)] = d_00;
+		dD[ii] = d_00_inv;
+		ii += 1;
+		// solve lower
+		for(; ii<m; ii++)
+			{
+			d_00 = pD[ii+ldd*(jj+1)];
+			d_00 *= d_00_inv;
+			pD[ii+ldd*(jj+1)] = d_00;
+			}
+		}
+	for(; jj<n; jj++)
+		{
+		ii = 0;
+		for(; ii<jj-1; ii+=2)
+			{
+			// correct upper
+			d_00 = pD[(ii+0)+ldd*jj];
+			d_10 = pD[(ii+1)+ldd*jj];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			// solve upper
+			d_10 -= pD[(ii+1)+ldd*kk] * d_00;
+			pD[(ii+0)+ldd*jj] = d_00;
+			pD[(ii+1)+ldd*jj] = d_10;
+			}
+		for(; ii<jj; ii++)
+			{
+			// correct upper
+			d_00 = pD[ii+ldd*jj];
+			for(kk=0; kk<ii; kk++)
+				{
+				d_00 -= pD[ii+ldd*kk] * pD[kk+ldd*jj];
+				}
+			// solve upper
+			pD[ii+ldd*jj] = d_00;
+			}
+		i0 = ii;
+		ii = jj;
+		// correct diagonal and lower and look for pivot
+		dmax = 0;
+		ip = ii;
+		for(; ii<m-1; ii+=2)
+			{
+			d_00 = pD[(ii+0)+ldd*jj];
+			d_10 = pD[(ii+1)+ldd*jj];
+			for(kk=0; kk<jj; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				d_10 -= pD[(ii+1)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			dtmp = d_00>0.0 ? d_00 : -d_00;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+0;
+				}
+			dtmp = d_10>0.0 ? d_10 : -d_10;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+1;
+				}
+			pD[(ii+0)+ldd*jj] = d_00;
+			pD[(ii+1)+ldd*jj] = d_10;
+			}
+		for(; ii<m; ii++)
+			{
+			d_00 = pD[(ii+0)+ldd*jj];
+			for(kk=0; kk<jj; kk++)
+				{
+				d_00 -= pD[(ii+0)+ldd*kk] * pD[kk+ldd*jj];
+				}
+			dtmp = d_00>0.0 ? d_00 : -d_00;
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+0;
+				}
+			pD[(ii+0)+ldd*jj] = d_00;
+			}
+		// row swap
+		ii = i0;
+		ipiv[ii] = ip;
+		if(ip!=ii)
+			{
+			for(kk=0; kk<n; kk++)
+				{
+				dtmp = pD[ii+ldd*kk];
+				pD[ii+ldd*kk] = pD[ip+ldd*kk];
+				pD[ip+ldd*kk] = dtmp;
+				}
+			}
+		// factorize diagonal
+		d_00 = pD[ii+ldd*jj];
+		d_00_inv = 1.0/d_00;
+		pD[ii+ldd*jj] = d_00;
+		dD[ii] = d_00_inv;
+		ii += 1;
+		for(; ii<m; ii++)
+			{
+			// correct lower
+			d_00 = pD[ii+ldd*jj];
+			// solve lower
+			d_00 *= d_00_inv;
+			pD[ii+ldd*jj] = d_00;
+			}
+		}
+#else
+	int iimax = m<n ? m : n;
+	for(ii=0; ii<iimax; ii++)
+		{
+		dmax = (pD[ii+ldd*ii]>0 ? pD[ii+ldd*ii] : -pD[ii+ldd*ii]);
+		ip = ii;
+		for(jj=1; jj<m-ii; jj++)
+			{
+			dtmp = pD[ii+jj+ldd*ii]>0 ? pD[ii+jj+ldd*ii] : -pD[ii+jj+ldd*ii];
+			if(dtmp>dmax)
+				{
+				dmax = dtmp;
+				ip = ii+jj;
+				}
+			}
+		ipiv[ii] = ip;
+		if(ip!=ii)
+			{
+			for(jj=0; jj<n; jj++)
+				{
+				dtmp = pD[ii+jj*ldd];
+				pD[ii+jj*ldd] = pD[ip+jj*ldd];
+				pD[ip+jj*ldd] = dtmp;
+				}
+			}
+		itmp0 = m-ii-1;
+		dtmp = 1.0/pD[ii+ldd*ii];
+		dD[ii] = dtmp;
+		for(jj=0; jj<itmp0; jj++)
+			{
+			pD[ii+1+jj+ldd*ii] *= dtmp;
+			}
+		itmp1 = n-ii-1;
+		for(jj=0; jj<itmp1; jj++)
+			{
+			for(kk=0; kk<itmp0; kk++)
+				{
+				pD[(ii+1+kk)+ldd*(ii+1+jj)] -= pD[(ii+1+kk)+ldd*ii] * pD[ii+ldd*(ii+1+jj)];
+				}
+			}
+		}
+#endif
+	return;	
+	}
+
+
+
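+// work space size for dgeqrf (the reference implementation needs no extra work space)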
+int GEQRF_WORK_SIZE_LIBSTR(int m, int n)
+	{
+	return 0;
+	}
+
+
+
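+// dgeqrf (Householder QR factorization: the reflectors overwrite D below the diagonal, the tau scalars are stored in sD->dA)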
+void GEQRF_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRMAT *sD, int di, int dj, void *work)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	int lda = sA->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA+ai+aj*lda;
+	REAL *pD = sD->pA+di+dj*ldd; // matrix of QR
+	REAL *dD = sD->dA+di; // vectors of tau
+	REAL alpha, beta, tmp, w0, w1;
+	REAL *pC00, *pC01, *pC11, *pv0, *pv1;
+	REAL pW[4] = {0.0, 0.0, 0.0, 0.0};
+	int ldw = 2;
+	REAL pT[4] = {0.0, 0.0, 0.0, 0.0};
+	int ldb = 2;
+	int imax, jmax, kmax;
+	// copy if needed
+	if(pA!=pD)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			for(ii=0; ii<m; ii++)
+				{
+				pD[ii+ldd*jj] = pA[ii+lda*jj];
+				}
+			}
+		}
+	imax = m<n ? m : n;
+	ii = 0;
+#if 1
+	for(; ii<imax-1; ii+=2)
+		{
+		// first column
+		pC00 = &pD[ii+ldd*ii];
+		beta = 0.0;
+		for(jj=1; jj<m-ii; jj++)
+			{
+			tmp = pC00[jj+ldd*0];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			// tau0
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0+ldd*0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			// tau0
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			// compute v0
+			pC00[0+ldd*0] = beta;
+			for(jj=1; jj<m-ii; jj++)
+				{
+				pC00[jj+ldd*0] *= tmp;
+				}
+			}
+		// gemv_t & ger
+		pC01 = &pC00[0+ldd*1];
+		pv0 = &pC00[0+ldd*0];
+		kmax = m-ii;
+		w0 = pC01[0+ldd*0]; // pv0[0] = 1.0
+		for(kk=1; kk<kmax; kk++)
+			{
+			w0 += pC01[kk+ldd*0] * pv0[kk];
+			}
+		w0 = - dD[ii] * w0;
+		pC01[0+ldd*0] += w0; // pv0[0] = 1.0
+		for(kk=1; kk<kmax; kk++)
+			{
+			pC01[kk+ldd*0] += w0 * pv0[kk];
+			}
+		// second column
+		pC11 = &pD[(ii+1)+ldd*(ii+1)];
+		beta = 0.0;
+		for(jj=1; jj<m-(ii+1); jj++)
+			{
+			tmp = pC11[jj+ldd*0];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			// tau1
+			dD[(ii+1)] = 0.0;
+			}
+		else
+			{
+			alpha = pC11[0+ldd*0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			// tau1
+			dD[(ii+1)] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			// compute v1
+			pC11[0+ldd*0] = beta;
+			for(jj=1; jj<m-(ii+1); jj++)
+				pC11[jj+ldd*0] *= tmp;
+			}
+		// compute lower triangular T containing tau for matrix update
+		pv0 = &pC00[0+ldd*0];
+		pv1 = &pC00[0+ldd*1];
+		kmax = m-ii;
+		tmp = pv0[1];
+		for(kk=2; kk<kmax; kk++)
+			tmp += pv0[kk]*pv1[kk];
+		pT[0+ldb*0] = dD[ii+0];
+		pT[1+ldb*0] = - dD[ii+1] * tmp * dD[ii+0];
+		pT[1+ldb*1] = dD[ii+1];
+		jmax = n-ii-2;
+		jj = 0;
+		for(; jj<jmax-1; jj+=2)
+			{
+			// compute W^T = C^T * V
+			pW[0+ldw*0] = pC00[0+ldd*(jj+0+2)] + pC00[1+ldd*(jj+0+2)] * pv0[1];
+			pW[1+ldw*0] = pC00[0+ldd*(jj+1+2)] + pC00[1+ldd*(jj+1+2)] * pv0[1];
+			pW[0+ldw*1] =                        pC00[1+ldd*(jj+0+2)];
+			pW[1+ldw*1] =                        pC00[1+ldd*(jj+1+2)];
+			kk = 2;
+			for(; kk<kmax; kk++)
+				{
+				tmp = pC00[kk+ldd*(jj+0+2)];
+				pW[0+ldw*0] += tmp * pv0[kk];
+				pW[0+ldw*1] += tmp * pv1[kk];
+				tmp = pC00[kk+ldd*(jj+1+2)];
+				pW[1+ldw*0] += tmp * pv0[kk];
+				pW[1+ldw*1] += tmp * pv1[kk];
+				}
+			// compute W^T *= T
+			pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
+			pW[1+ldw*1] = pT[1+ldb*0]*pW[1+ldw*0] + pT[1+ldb*1]*pW[1+ldw*1];
+			pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
+			pW[1+ldw*0] = pT[0+ldb*0]*pW[1+ldw*0];
+			// compute C -= V * W^T
+			pC00[0+ldd*(jj+0+2)] -= pW[0+ldw*0];
+			pC00[0+ldd*(jj+1+2)] -= pW[1+ldw*0];
+			pC00[1+ldd*(jj+0+2)] -= pv0[1]*pW[0+ldw*0] + pW[0+ldw*1];
+			pC00[1+ldd*(jj+1+2)] -= pv0[1]*pW[1+ldw*0] + pW[1+ldw*1];
+			kk = 2;
+			for(; kk<kmax-1; kk+=2)
+				{
+				pC00[kk+0+ldd*(jj+0+2)] -= pv0[kk+0]*pW[0+ldw*0] + pv1[kk+0]*pW[0+ldw*1];
+				pC00[kk+1+ldd*(jj+0+2)] -= pv0[kk+1]*pW[0+ldw*0] + pv1[kk+1]*pW[0+ldw*1];
+				pC00[kk+0+ldd*(jj+1+2)] -= pv0[kk+0]*pW[1+ldw*0] + pv1[kk+0]*pW[1+ldw*1];
+				pC00[kk+1+ldd*(jj+1+2)] -= pv0[kk+1]*pW[1+ldw*0] + pv1[kk+1]*pW[1+ldw*1];
+				}
+			for(; kk<kmax; kk++)
+				{
+				pC00[kk+ldd*(jj+0+2)] -= pv0[kk]*pW[0+ldw*0] + pv1[kk]*pW[0+ldw*1];
+				pC00[kk+ldd*(jj+1+2)] -= pv0[kk]*pW[1+ldw*0] + pv1[kk]*pW[1+ldw*1];
+				}
+			}
+		for(; jj<jmax; jj++)
+			{
+			// compute W = T * V^T * C
+			pW[0+ldw*0] = pC00[0+ldd*(jj+0+2)] + pC00[1+ldd*(jj+0+2)] * pv0[1];
+			pW[0+ldw*1] =                        pC00[1+ldd*(jj+0+2)];
+			for(kk=2; kk<kmax; kk++)
+				{
+				tmp = pC00[kk+ldd*(jj+0+2)];
+				pW[0+ldw*0] += tmp * pv0[kk];
+				pW[0+ldw*1] += tmp * pv1[kk];
+				}
+			pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
+			pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
+			// compute C -= V * W^T
+			pC00[0+ldd*(jj+0+2)] -= pW[0+ldw*0];
+			pC00[1+ldd*(jj+0+2)] -= pv0[1]*pW[0+ldw*0] + pW[0+ldw*1];
+			for(kk=2; kk<kmax; kk++)
+				{
+				pC00[kk+ldd*(jj+0+2)] -= pv0[kk]*pW[0+ldw*0] + pv1[kk]*pW[0+ldw*1];
+				}
+			}
+		}
+#endif
+	for(; ii<imax; ii++)
+		{
+		pC00 = &pD[ii+ldd*ii];
+		beta = 0.0;
+		for(jj=1; jj<m-ii; jj++)
+			{
+			tmp = pC00[jj+ldd*0];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0+ldd*0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			for(jj=1; jj<m-ii; jj++)
+				pC00[jj+ldd*0] *= tmp;
+			pC00[0+ldd*0] = beta;
+			}
+		if(ii<n)
+			{
+			// gemv_t & ger
+			pC01 = &pC00[0+ldd*1];
+			pv0 = &pC00[0+ldd*0];
+			jmax = n-ii-1;
+			kmax = m-ii;
+			jj = 0;
+			for(; jj<jmax-1; jj+=2)
+				{
+				w0 = pC01[0+ldd*(jj+0)]; // pv0[0] = 1.0
+				w1 = pC01[0+ldd*(jj+1)]; // pv0[0] = 1.0
+				for(kk=1; kk<kmax; kk++)
+					{
+					w0 += pC01[kk+ldd*(jj+0)] * pv0[kk];
+					w1 += pC01[kk+ldd*(jj+1)] * pv0[kk];
+					}
+				w0 = - dD[ii] * w0;
+				w1 = - dD[ii] * w1;
+				pC01[0+ldd*(jj+0)] += w0; // pv0[0] = 1.0
+				pC01[0+ldd*(jj+1)] += w1; // pv0[0] = 1.0
+				for(kk=1; kk<kmax; kk++)
+					{
+					pC01[kk+ldd*(jj+0)] += w0 * pv0[kk];
+					pC01[kk+ldd*(jj+1)] += w1 * pv0[kk];
+					}
+				}
+			for(; jj<jmax; jj++)
+				{
+				w0 = pC01[0+ldd*jj]; // pv0[0] = 1.0
+				for(kk=1; kk<kmax; kk++)
+					{
+					w0 += pC01[kk+ldd*jj] * pv0[kk];
+					}
+				w0 = - dD[ii] * w0;
+				pC01[0+ldd*jj] += w0; // pv0[0] = 1.0
+				for(kk=1; kk<kmax; kk++)
+					{
+					pC01[kk+ldd*jj] += w0 * pv0[kk];
+					}
+				}
+			}
+		}
+	return;
+	}
+
+
+
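+// work space size for dgelqf (the reference implementation needs no extra work space)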
+int GELQF_WORK_SIZE_LIBSTR(int m, int n)
+	{
+	return 0;
+	}
+
+
+
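+// dgelqf (Householder LQ factorization: the reflectors overwrite D to the right of the diagonal, the tau scalars are stored in sD->dA)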
+void GELQF_LIBSTR(int m, int n, struct STRMAT *sA, int ai, int aj, struct STRMAT *sD, int di, int dj, void *work)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk;
+	int lda = sA->m;
+	int ldd = sD->m;
+	REAL *pA = sA->pA+ai+aj*lda;
+	REAL *pD = sD->pA+di+dj*ldd; // matrix of QR
+	REAL *dD = sD->dA+di; // vectors of tau
+	REAL alpha, beta, tmp, w0, w1;
+	REAL *pC00, *pC10, *pC11, *pv0, *pv1;
+	REAL pW[4] = {0.0, 0.0, 0.0, 0.0};
+	int ldw = 2;
+	REAL pT[4] = {0.0, 0.0, 0.0, 0.0};
+	int ldb = 2;
+	int imax, jmax, kmax;
+	// copy if needed
+	if(pA!=pD)
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			for(ii=0; ii<m; ii++)
+				{
+				pD[ii+ldd*jj] = pA[ii+lda*jj];
+				}
+			}
+		}
+	imax = m<n ? m : n;
+	ii = 0;
+#if 1
+	for(; ii<imax-1; ii+=2)
+		{
+		// first column
+		pC00 = &pD[ii+ldd*ii];
+		beta = 0.0;
+		for(jj=1; jj<n-ii; jj++)
+			{
+			tmp = pC00[0+ldd*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			// tau0
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0+ldd*0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			// tau0
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			// compute v0
+			pC00[0+ldd*0] = beta;
+			for(jj=1; jj<n-ii; jj++)
+				{
+				pC00[0+ldd*jj] *= tmp;
+				}
+			}
+		// gemv_t & ger
+		pC10 = &pC00[1+ldd*0];
+		pv0 = &pC00[0+ldd*0];
+		kmax = n-ii;
+		w0 = pC10[0+ldd*0]; // pv0[0] = 1.0
+		for(kk=1; kk<kmax; kk++)
+			{
+			w0 += pC10[0+ldd*kk] * pv0[0+ldd*kk];
+			}
+		w0 = - dD[ii] * w0;
+		pC10[0+ldd*0] += w0; // pv0[0] = 1.0
+		for(kk=1; kk<kmax; kk++)
+			{
+			pC10[0+ldd*kk] += w0 * pv0[0+ldd*kk];
+			}
+		// second row
+		pC11 = &pD[(ii+1)+ldd*(ii+1)];
+		beta = 0.0;
+		for(jj=1; jj<n-(ii+1); jj++)
+			{
+			tmp = pC11[0+ldd*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			// tau1
+			dD[(ii+1)] = 0.0;
+			}
+		else
+			{
+			alpha = pC11[0+ldd*0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			// tau1
+			dD[(ii+1)] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			// compute v1
+			pC11[0+ldd*0] = beta;
+			for(jj=1; jj<n-(ii+1); jj++)
+				pC11[0+ldd*jj] *= tmp;
+			}
+		// compute lower triangular T containing tau for matrix update
+		pv0 = &pC00[0+ldd*0];
+		pv1 = &pC00[1+ldd*0];
+		kmax = n-ii;
+		tmp = pv0[0+ldd*1];
+		for(kk=2; kk<kmax; kk++)
+			tmp += pv0[0+ldd*kk]*pv1[0+ldd*kk];
+		pT[0+ldb*0] = dD[ii+0];
+		pT[1+ldb*0] = - dD[ii+1] * tmp * dD[ii+0];
+		pT[1+ldb*1] = dD[ii+1];
+		// downgrade
+		jmax = m-ii-2;
+		jj = 0;
+		for(; jj<jmax-1; jj+=2)
+			{
+			// compute W^T = C^T * V
+			pW[0+ldw*0] = pC00[jj+0+2+ldd*0] + pC00[jj+0+2+ldd*1] * pv0[0+ldd*1];
+			pW[1+ldw*0] = pC00[jj+1+2+ldd*0] + pC00[jj+1+2+ldd*1] * pv0[0+ldd*1];
+			pW[0+ldw*1] =                      pC00[jj+0+2+ldd*1];
+			pW[1+ldw*1] =                      pC00[jj+1+2+ldd*1];
+			kk = 2;
+			for(; kk<kmax; kk++)
+				{
+				tmp = pC00[jj+0+2+ldd*kk];
+				pW[0+ldw*0] += tmp * pv0[0+ldd*kk];
+				pW[0+ldw*1] += tmp * pv1[0+ldd*kk];
+				tmp = pC00[jj+1+2+ldd*kk];
+				pW[1+ldw*0] += tmp * pv0[0+ldd*kk];
+				pW[1+ldw*1] += tmp * pv1[0+ldd*kk];
+				}
+			// compute W^T *= T
+			pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
+			pW[1+ldw*1] = pT[1+ldb*0]*pW[1+ldw*0] + pT[1+ldb*1]*pW[1+ldw*1];
+			pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
+			pW[1+ldw*0] = pT[0+ldb*0]*pW[1+ldw*0];
+			// compute C -= V * W^T
+			pC00[jj+0+2+ldd*0] -= pW[0+ldw*0];
+			pC00[jj+1+2+ldd*0] -= pW[1+ldw*0];
+			pC00[jj+0+2+ldd*1] -= pv0[0+ldd*1]*pW[0+ldw*0] + pW[0+ldw*1];
+			pC00[jj+1+2+ldd*1] -= pv0[0+ldd*1]*pW[1+ldw*0] + pW[1+ldw*1];
+			kk = 2;
+			for(; kk<kmax-1; kk+=2)
+				{
+				pC00[jj+0+2+ldd*(kk+0)] -= pv0[0+ldd*(kk+0)]*pW[0+ldw*0] + pv1[0+ldd*(kk+0)]*pW[0+ldw*1];
+				pC00[jj+0+2+ldd*(kk+1)] -= pv0[0+ldd*(kk+1)]*pW[0+ldw*0] + pv1[0+ldd*(kk+1)]*pW[0+ldw*1];
+				pC00[jj+1+2+ldd*(kk+0)] -= pv0[0+ldd*(kk+0)]*pW[1+ldw*0] + pv1[0+ldd*(kk+0)]*pW[1+ldw*1];
+				pC00[jj+1+2+ldd*(kk+1)] -= pv0[0+ldd*(kk+1)]*pW[1+ldw*0] + pv1[0+ldd*(kk+1)]*pW[1+ldw*1];
+				}
+			for(; kk<kmax; kk++)
+				{
+				pC00[jj+0+2+ldd*kk] -= pv0[0+ldd*kk]*pW[0+ldw*0] + pv1[0+ldd*kk]*pW[0+ldw*1];
+				pC00[jj+1+2+ldd*kk] -= pv0[0+ldd*kk]*pW[1+ldw*0] + pv1[0+ldd*kk]*pW[1+ldw*1];
+				}
+			}
+		for(; jj<jmax; jj++)
+			{
+			// compute W = T * V^T * C
+			pW[0+ldw*0] = pC00[jj+0+2+ldd*0] + pC00[jj+0+2+ldd*1] * pv0[0+ldd*1];
+			pW[0+ldw*1] =                      pC00[jj+0+2+ldd*1];
+			for(kk=2; kk<kmax; kk++)
+				{
+				tmp = pC00[jj+0+2+ldd*kk];
+				pW[0+ldw*0] += tmp * pv0[0+ldd*kk];
+				pW[0+ldw*1] += tmp * pv1[0+ldd*kk];
+				}
+			pW[0+ldw*1] = pT[1+ldb*0]*pW[0+ldw*0] + pT[1+ldb*1]*pW[0+ldw*1];
+			pW[0+ldw*0] = pT[0+ldb*0]*pW[0+ldw*0];
+			// compute C -= V * W^T
+			pC00[jj+0+2+ldd*0] -= pW[0+ldw*0];
+			pC00[jj+0+2+ldd*1] -= pv0[0+ldd*1]*pW[0+ldw*0] + pW[0+ldw*1];
+			for(kk=2; kk<kmax; kk++)
+				{
+				pC00[jj+0+2+ldd*kk] -= pv0[0+ldd*kk]*pW[0+ldw*0] + pv1[0+ldd*kk]*pW[0+ldw*1];
+				}
+			}
+		}
+#endif
+	for(; ii<imax; ii++)
+		{
+		pC00 = &pD[ii+ldd*ii];
+		beta = 0.0;
+		for(jj=1; jj<n-ii; jj++)
+			{
+			tmp = pC00[0+ldd*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0+ldd*0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			for(jj=1; jj<n-ii; jj++)
+				pC00[0+ldd*jj] *= tmp;
+			pC00[0+ldd*0] = beta;
+			}
+		if(ii<n)
+			{
+			// gemv_t & ger
+			pC10 = &pC00[1+ldd*0];
+			pv0 = &pC00[0+ldd*0];
+			jmax = m-ii-1;
+			kmax = n-ii;
+			jj = 0;
+			for(; jj<jmax-1; jj+=2)
+				{
+				w0 = pC10[jj+0+ldd*0]; // pv0[0] = 1.0
+				w1 = pC10[jj+1+ldd*0]; // pv0[0] = 1.0
+				for(kk=1; kk<kmax; kk++)
+					{
+					w0 += pC10[jj+0+ldd*kk] * pv0[0+ldd*kk];
+					w1 += pC10[jj+1+ldd*kk] * pv0[0+ldd*kk];
+					}
+				w0 = - dD[ii] * w0;
+				w1 = - dD[ii] * w1;
+				pC10[jj+0+ldd*0] += w0; // pv0[0] = 1.0
+				pC10[jj+1+ldd*0] += w1; // pv0[0] = 1.0
+				for(kk=1; kk<kmax; kk++)
+					{
+					pC10[jj+0+ldd*kk] += w0 * pv0[0+ldd*kk];
+					pC10[jj+1+ldd*kk] += w1 * pv0[0+ldd*kk];
+					}
+				}
+			for(; jj<jmax; jj++)
+				{
+				w0 = pC10[jj+ldd*0]; // pv0[0] = 1.0
+				for(kk=1; kk<kmax; kk++)
+					{
+					w0 += pC10[jj+ldd*kk] * pv0[0+ldd*kk];
+					}
+				w0 = - dD[ii] * w0;
+				pC10[jj+ldd*0] += w0; // pv0[0] = 1.0
+				for(kk=1; kk<kmax; kk++)
+					{
+					pC10[jj+ldd*kk] += w0 * pv0[0+ldd*kk];
+					}
+				}
+			}
+		}
+	return;
+	}
+
+
+
+#elif defined(LA_BLAS)
+
+
+
+// dpotrf
+void POTRF_L_LIBSTR(int m, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0)
+		return;
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	REAL d1 = 1.0;
+	REAL *pC = sC->pA+ci+cj*sC->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long info;
+	long long tmp;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<m; jj++)
+			{
+			tmp = m-jj;
+			COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
+			}
+		}
+	POTRF(&cl, &mm, pD, &ldd, &info);
+#else
+	int i1 = 1;
+	int info;
+	int tmp;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<m; jj++)
+			{
+			tmp = m-jj;
+			COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
+			}
+		}
+	POTRF(&cl, &m, pD, &ldd, &info);
+#endif
+	return;
+	}
+
+
+
+// dpotrf
+void POTRF_L_MN_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	REAL d1 = 1.0;
+	REAL *pC = sC->pA+ci+cj*sC->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long mmn = mm-nn;
+	long long info;
+	long long tmp;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			tmp = m-jj;
+			COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
+			}
+		}
+	POTRF(&cl, &nn, pD, &ldd, &info);
+	TRSM(&cr, &cl, &ct, &cn, &mmn, &nn, &d1, pD, &ldd, pD+n, &ldd);
+#else
+	int i1 = 1;
+	int mmn = m-n;
+	int info;
+	int tmp;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			{
+			tmp = m-jj;
+			COPY(&tmp, pC+jj*ldc+jj, &i1, pD+jj*ldd+jj, &i1);
+			}
+		}
+	POTRF(&cl, &n, pD, &ldd, &info);
+	TRSM(&cr, &cl, &ct, &cn, &mmn, &n, &d1, pD, &ldd, pD+n, &ldd);
+#endif
+	return;
+	}
+
+
+
+// dsyrk dpotrf
+void SYRK_POTRF_LN_LIBSTR(int m, int n, int k, struct STRMAT *sA, int ai, int aj, struct STRMAT *sB, int bi, int bj, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int jj;
+	char cl = 'l';
+	char cn = 'n';
+	char cr = 'r';
+	char ct = 't';
+	char cu = 'u';
+	REAL d1 = 1.0;
+	REAL *pA = sA->pA + ai + aj*sA->m;
+	REAL *pB = sB->pA + bi + bj*sB->m;
+	REAL *pC = sC->pA + ci + cj*sC->m;
+	REAL *pD = sD->pA + di + dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long kk = k;
+	long long mmn = mm-nn;
+	long long info;
+	long long lda = sA->m;
+	long long ldb = sB->m;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	if(pA==pB)
+		{
+		SYRK(&cl, &cn, &nn, &kk, &d1, pA, &lda, &d1, pD, &ldd);
+		GEMM(&cn, &ct, &mmn, &nn, &kk, &d1, pA+n, &lda, pB, &ldb, &d1, pD+n, &ldd);
+		POTRF(&cl, &nn, pD, &ldd, &info);
+		TRSM(&cr, &cl, &ct, &cn, &mmn, &nn, &d1, pD, &ldd, pD+n, &ldd);
+		}
+	else
+		{
+		GEMM(&cn, &ct, &mm, &nn, &kk, &d1, pA, &lda, pB, &ldb, &d1, pD, &ldd);
+		POTRF(&cl, &nn, pD, &ldd, &info);
+		TRSM(&cr, &cl, &ct, &cn, &mmn, &nn, &d1, pD, &ldd, pD+n, &ldd);
+		}
+#else
+	int i1 = 1;
+	int mmn = m-n;
+	int info;
+	int lda = sA->m;
+	int ldb = sB->m;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	if(pA==pB)
+		{
+		SYRK(&cl, &cn, &n, &k, &d1, pA, &lda, &d1, pD, &ldd);
+		GEMM(&cn, &ct, &mmn, &n, &k, &d1, pA+n, &lda, pB, &ldb, &d1, pD+n, &ldd);
+		POTRF(&cl, &n, pD, &ldd, &info);
+		TRSM(&cr, &cl, &ct, &cn, &mmn, &n, &d1, pD, &ldd, pD+n, &ldd);
+		}
+	else
+		{
+		GEMM(&cn, &ct, &m, &n, &k, &d1, pA, &lda, pB, &ldb, &d1, pD, &ldd);
+		POTRF(&cl, &n, pD, &ldd, &info);
+		TRSM(&cr, &cl, &ct, &cn, &mmn, &n, &d1, pD, &ldd, pD+n, &ldd);
+		}
+#endif
+	return;
+	}
+
+
+
+// dgetrf without pivoting
+#if defined(REF_BLAS_BLIS)
+static void GETF2_NOPIVOT(long long m, long long n, REAL *A, long long lda)
+	{
+	if(m<=0 | n<=0)
+		return;
+	long long i, j;
+	long long jmax = m<n ? m : n;
+	REAL dtmp;
+	REAL dm1 = -1.0;
+	long long itmp0, itmp1;
+	long long i1 = 1;
+	for(j=0; j<jmax; j++)
+		{
+		itmp0 = m-j-1;
+		dtmp = 1.0/A[j+lda*j];
+		SCAL(&itmp0, &dtmp, &A[(j+1)+lda*j], &i1);
+		itmp1 = n-j-1;
+		GER(&itmp0, &itmp1, &dm1, &A[(j+1)+lda*j], &i1, &A[j+lda*(j+1)], &lda, &A[(j+1)+lda*(j+1)], &lda);
+		}
+	return;
+	}
+#else
+static void GETF2_NOPIVOT(int m, int n, REAL *A, int lda)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int i, j;
+	int jmax = m<n ? m : n;
+	REAL dtmp;
+	REAL dm1 = -1.0;
+	int itmp0, itmp1;
+	int i1 = 1;
+	for(j=0; j<jmax; j++)
+		{
+		itmp0 = m-j-1;
+		dtmp = 1.0/A[j+lda*j];
+		SCAL(&itmp0, &dtmp, &A[(j+1)+lda*j], &i1);
+		itmp1 = n-j-1;
+		GER(&itmp0, &itmp1, &dm1, &A[(j+1)+lda*j], &i1, &A[j+lda*(j+1)], &lda, &A[(j+1)+lda*(j+1)], &lda);
+		}
+	return;
+	}
+#endif
+
+
+
+// dgetrf without pivoting
+void GETRF_NOPIVOT_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj)
+	{
+	// TODO with custom level 2 LAPACK + level 3 BLAS
+	if(m<=0 | n<=0)
+		return;
+	int jj;
+	REAL d1 = 1.0;
+	REAL *pC = sC->pA+ci+cj*sC->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long mm = m;
+	long long nn = n;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	GETF2_NOPIVOT(mm, nn, pD, ldd);
+#else
+	int i1 = 1;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	GETF2_NOPIVOT(m, n, pD, ldd);
+#endif
+	return;
+	}
+
+
+
+// dgetrf pivoting
+void GETRF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, int *ipiv)
+	{
+	// TODO with custom level 2 LAPACK + level 3 BLAS
+	if(m<=0 | n<=0)
+		return;
+	int jj;
+	int tmp = m<n ? m : n;
+	REAL d1 = 1.0;
+	REAL *pC = sC->pA+ci+cj*sC->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long info;
+	long long mm = m;
+	long long nn = n;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	GETRF(&mm, &nn, pD, &ldd, (long long *) ipiv, &info);
+	for(jj=0; jj<tmp; jj++)
+		ipiv[jj] -= 1;
+#else
+	int i1 = 1;
+	int info;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+	GETRF(&m, &n, pD, &ldd, ipiv, &info);
+	for(jj=0; jj<tmp; jj++)
+		ipiv[jj] -= 1;
+#endif
+	return;
+	}
+
+
+
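+// work space size for dgeqrf (obtained via a LAPACK workspace query with lwork=-1)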
+int GEQRF_WORK_SIZE_LIBSTR(int m, int n)
+	{
+	REAL dwork;
+	REAL *pD, *dD;
+#if defined(REF_BLAS_BLIS)
+	long long mm = m;
+	long long nn = n;
+	long long lwork = -1;
+	long long info;
+	long long ldd = mm;
+	GEQRF(&mm, &nn, pD, &ldd, dD, &dwork, &lwork, &info);
+#else
+	int lwork = -1;
+	int info;
+	int ldd = m;
+	GEQRF(&m, &n, pD, &ldd, dD, &dwork, &lwork, &info);
+#endif
+	int size = dwork;
+	return size*sizeof(REAL);
+	}
+
+
+
+void GEQRF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, void *work)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int jj;
+	REAL *pC = sC->pA+ci+cj*sC->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+	REAL *dD = sD->dA+di;
+	REAL *dwork = (REAL *) work;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long info = -1;
+	long long mm = m;
+	long long nn = n;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+//	GEQR2(&mm, &nn, pD, &ldd, dD, dwork, &info);
+	long long lwork = -1;
+	GEQRF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
+	lwork = dwork[0];
+	GEQRF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
+#else
+	int i1 = 1;
+	int info = -1;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+//	GEQR2(&m, &n, pD, &ldd, dD, dwork, &info);
+	int lwork = -1;
+	GEQRF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
+	lwork = dwork[0];
+	GEQRF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
+#endif
+	return;
+	}
+
+
+
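+// work space size for dgelqf (obtained via a LAPACK workspace query with lwork=-1)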
+int GELQF_WORK_SIZE_LIBSTR(int m, int n)
+	{
+	REAL dwork;
+	REAL *pD, *dD;
+#if defined(REF_BLAS_BLIS)
+	long long mm = m;
+	long long nn = n;
+	long long lwork = -1;
+	long long info;
+	long long ldd = mm;
+	GELQF(&mm, &nn, pD, &ldd, dD, &dwork, &lwork, &info);
+#else
+	int lwork = -1;
+	int info;
+	int ldd = m;
+	GELQF(&m, &n, pD, &ldd, dD, &dwork, &lwork, &info);
+#endif
+	int size = dwork;
+	return size*sizeof(REAL);
+	}
+
+
+
+void GELQF_LIBSTR(int m, int n, struct STRMAT *sC, int ci, int cj, struct STRMAT *sD, int di, int dj, void *work)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int jj;
+	REAL *pC = sC->pA+ci+cj*sC->m;
+	REAL *pD = sD->pA+di+dj*sD->m;
+	REAL *dD = sD->dA+di;
+	REAL *dwork = (REAL *) work;
+#if defined(REF_BLAS_BLIS)
+	long long i1 = 1;
+	long long info = -1;
+	long long mm = m;
+	long long nn = n;
+	long long ldc = sC->m;
+	long long ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&mm, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+//	GEQR2(&mm, &nn, pD, &ldd, dD, dwork, &info);
+	long long lwork = -1;
+	GELQF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
+	lwork = dwork[0];
+	GELQF(&mm, &nn, pD, &ldd, dD, dwork, &lwork, &info);
+#else
+	int i1 = 1;
+	int info = -1;
+	int ldc = sC->m;
+	int ldd = sD->m;
+	if(!(pC==pD))
+		{
+		for(jj=0; jj<n; jj++)
+			COPY(&m, pC+jj*ldc, &i1, pD+jj*ldd, &i1);
+		}
+//	GEQR2(&m, &n, pD, &ldd, dD, dwork, &info);
+	int lwork = -1;
+	GELQF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
+	lwork = dwork[0];
+	GELQF(&m, &n, pD, &ldd, dD, dwork, &lwork, &info);
+#endif
+	return;
+	}
+
+
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+
+
+
diff --git a/blasfeo_target.h.in b/blasfeo_target.h.in
new file mode 100644
index 0000000..a98ac81
--- /dev/null
+++ b/blasfeo_target.h.in
@@ -0,0 +1,11 @@
+#ifndef TARGET_@TARGET@
+#define TARGET_@TARGET@
+#endif
+
+#ifndef LA_@LA@
+#define LA_@LA@
+#endif
+
+#ifndef EXT_DEP
+#cmakedefine EXT_DEP @EXT_DEP@
+#endif
diff --git a/doc/guide.pdf b/doc/guide.pdf
new file mode 100644
index 0000000..9f81df3
--- /dev/null
+++ b/doc/guide.pdf
Binary files differ
diff --git a/doc/guide.tex b/doc/guide.tex
new file mode 100644
index 0000000..626eaa4
--- /dev/null
+++ b/doc/guide.tex
@@ -0,0 +1,149 @@
+\documentclass[a4paper]{report}
+
+\usepackage[margin=3.0cm]{geometry}
+\usepackage{amsmath}
+\usepackage[pdftex]{graphicx}
+%\usepackage{graphics}
+\usepackage{subfig}
+
+
+
+\title{BLASFEO reference guide}
+\author{Gianluca Frison}
+
+
+
+\begin{document}
+
+\maketitle
+\tableofcontents
+
+
+
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Introduction}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+BLASFEO - BLAS For Embedded Optimization.
+
+
+
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Matrix data type}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+The fundamental data type in BLASFEO is a C struct defining a matrix, called {\tt strmat}.
+Depending on the chosen linear algebra library, the struct is defined differently.
+
+
+
+\section{{\tt strmat} definition}
+
+
+\subsection{BLASFEO}
+
+\begin{verbatim}
+struct d_strmat 
+	{
+	int bs;
+	int m;
+	int n;
+	int pm;
+	int cn;
+	double *pA;
+	double *dA;
+	int use_dA;
+	int memory_size;
+	};
+\end{verbatim}
+where the struct members are
+\begin{description}
+\item[bs] height of the panel
+\item[m] number of rows
+\item[n] number of columns
+\item[pm] number of rows of the matrix as allocated in memory, used for memory alignment
+\item[cn] number of columns of the matrix as allocated in memory, used for memory alignment
+\item[pA] pointer to a pm$\times$cn array of doubles, the first element is aligned to cache line size
+\item[dA] pointer to an array of min(m,n) doubles, used e.g. to store the inverse of the diagonal of the matrix
+\item[use\_dA] flag telling whether dA contains useful information
+\item[memory\_size] size of the memory (in bytes) needed for pA and dA
+\end{description}
+
+
+\subsection{BLAS}
+
+\begin{verbatim}
+struct d_strmat 
+	{
+	int m; // rows
+	int n; // cols
+	double *pA; // pointer to a m*n array of doubles
+	int memory_size; // size of needed memory
+	};
+\end{verbatim}
+\begin{description}
+\item[m] number of rows
+\item[n] number of columns
+\item[pA] pointer to an m$\times$n array of doubles
+\item[memory\_size] size of the memory (in bytes) needed for pA
+\end{description}
+
+
+
+\section{{\tt strmat} management}
+
+\begin{verbatim}
+void d_allocate_strmat(int m, int n, struct d_strmat *sA);
+\end{verbatim}
+
+\begin{verbatim}
+void d_free_strmat(struct d_strmat *sA);
+\end{verbatim}
+
+\begin{verbatim}
+int d_size_strmat(int m, int n);
+\end{verbatim}
+
+\begin{verbatim}
+void d_create_strmat(int m, int n, struct d_strmat *sA, void *memory);
+\end{verbatim}
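+
+As a minimal sketch (for some sizes {\tt m} and {\tt n}, and using the auxiliary
+allocation helpers from {\tt blasfeo\_v\_aux\_ext\_dep.h}, as done in the shipped
+examples), the two management patterns look like:
+
+\begin{verbatim}
+// pattern 1: let the library allocate the memory
+struct d_strmat sA;
+d_allocate_strmat(m, n, &sA);
+// ... use sA ...
+d_free_strmat(&sA);
+
+// pattern 2: create the strmat on user-provided (aligned) memory
+int size = d_size_strmat(m, n);
+void *mem; v_zeros_align(&mem, size);
+struct d_strmat sB;
+d_create_strmat(m, n, &sB, mem);
+// ... use sB ...
+v_free_align(mem);
+\end{verbatim}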
+
+
+
+\section{{\tt strmat} conversion}
+
+\begin{verbatim}
+void d_cvt_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, 
+     int ai, int aj);
+\end{verbatim}
+
+\begin{verbatim}
+void d_cvt_tran_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, 
+     int ai, int aj);
+\end{verbatim}
+
+\begin{verbatim}
+void d_cvt_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, 
+     double *A, int lda);
+\end{verbatim}
+
+\begin{verbatim}
+void d_cvt_tran_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, 
+     double *A, int lda);
+\end{verbatim}
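+
+As a sketch, a column-major array {\tt A} (with leading dimension {\tt lda}) can be
+copied into a previously created {\tt strmat} {\tt sA} and back as follows:
+
+\begin{verbatim}
+// column-major A -> strmat sA, writing from element (0,0)
+d_cvt_mat2strmat(m, n, A, lda, &sA, 0, 0);
+// ... operate on sA ...
+// strmat sA -> column-major A
+d_cvt_strmat2mat(m, n, &sA, 0, 0, A, lda);
+\end{verbatim}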
+
+
+
+\section{{\tt strmat} print}
+
+\begin{verbatim}
+void d_print_strmat(int m, int n, struct d_strmat *sA, int ai, int aj);
+\end{verbatim}
+
+
+
+\end{document}
diff --git a/examples/Makefile b/examples/Makefile
new file mode 100644
index 0000000..7204cba
--- /dev/null
+++ b/examples/Makefile
@@ -0,0 +1,69 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../Makefile.rule
+
+ifeq ($(REF_BLAS), 0)
+LIBS = -lm 
+endif
+ifeq ($(REF_BLAS), OPENBLAS)
+LIBS = /opt/openblas/lib/libopenblas.a -pthread -lm
+endif
+ifeq ($(REF_BLAS), BLIS)
+LIBS = -lblis -lm -fopenmp
+endif
+ifeq ($(REF_BLAS), NETLIB)
+LIBS = /opt/netlib/liblapack.a /opt/netlib/libblas.a -lgfortran -lm
+endif
+ifeq ($(REF_BLAS), MKL)
+LIBS = -Wl,--start-group /opt/intel/mkl/lib/intel64/libmkl_gf_lp64.a /opt/intel/mkl/lib/intel64/libmkl_core.a /opt/intel/mkl/lib/intel64/libmkl_sequential.a -Wl,--end-group -ldl -lpthread -lm
+endif
+
+ifneq ($(NUM_THREAD), 1)
+LIBS += -pthread 
+endif
+
+#OBJS_TEST = example_d_lu_factorization.o
+#OBJS_TEST = example_s_lu_factorization.o
+OBJS_TEST = tools.o example_d_riccati_recursion.o
+#OBJS_TEST = tools.o example_s_riccati_recursion.o
+
+all: clean obj run
+
+obj: $(OBJS_TEST)
+	cp ../libblasfeo.a .
+	$(CC) -o test.out $(OBJS_TEST) -L. libblasfeo.a $(LIBS) #-pg
+
+run:
+	./test.out
+
+clean:
+	rm -f *.o
+	rm -f test.out
+	rm -f libblasfeo.a
+
diff --git a/examples/example_d_lu_factorization.c b/examples/example_d_lu_factorization.c
new file mode 100644
index 0000000..62b3413
--- /dev/null
+++ b/examples/example_d_lu_factorization.c
@@ -0,0 +1,210 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+
+int main()
+	{
+
+	printf("\nExample of LU factorization and backsolve\n\n");
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+	printf("\nLA provided by BLASFEO\n\n");
+
+#elif defined(LA_REFERENCE)
+
+	printf("\nLA provided by REFERENCE\n\n");
+
+#elif defined(LA_BLAS)
+
+	printf("\nLA provided by BLAS\n\n");
+
+#else
+
+	printf("\nLA provided by ???\n\n");
+	exit(2);
+
+#endif
+
+	int ii;
+
+	int n = 16;
+
+	//
+	// matrices in column-major format
+	//
+
+	double *A; d_zeros(&A, n, n);
+	for(ii=0; ii<n*n; ii++) A[ii] = ii;
+//	d_print_mat(n, n, A, n);
+
+	// spd matrix
+	double *B; d_zeros(&B, n, n);
+	for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+//	d_print_mat(n, n, B, n);
+
+	// identity
+	double *I; d_zeros(&I, n, n);
+	for(ii=0; ii<n; ii++) I[ii*(n+1)] = 1.0;
+//	d_print_mat(n, n, B, n);
+
+	// result matrix
+	double *D; d_zeros(&D, n, n);
+//	d_print_mat(n, n, D, n);
+
+	// permutation indices
+	int *ipiv; int_zeros(&ipiv, n, 1);
+
+	//
+	// matrices in matrix struct format
+	//
+
+	// work space large enough for 5 matrix structs of size n times n
+	int size_strmat = 5*d_size_strmat(n, n);
+	void *memory_strmat; v_zeros_align(&memory_strmat, size_strmat);
+	char *ptr_memory_strmat = (char *) memory_strmat;
+
+	struct d_strmat sA;
+//	d_allocate_strmat(n, n, &sA);
+	d_create_strmat(n, n, &sA, ptr_memory_strmat);
+	ptr_memory_strmat += sA.memory_size;
+	// convert from column major matrix to strmat
+	d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+	printf("\nA = \n");
+	d_print_strmat(n, n, &sA, 0, 0);
+
+	struct d_strmat sB;
+//	d_allocate_strmat(n, n, &sB);
+	d_create_strmat(n, n, &sB, ptr_memory_strmat);
+	ptr_memory_strmat += sB.memory_size;
+	// convert from column major matrix to strmat
+	d_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+	printf("\nB = \n");
+	d_print_strmat(n, n, &sB, 0, 0);
+
+	struct d_strmat sI;
+//	d_allocate_strmat(n, n, &sI);
+	d_create_strmat(n, n, &sI, ptr_memory_strmat);
+	ptr_memory_strmat += sI.memory_size;
+	// convert from column major matrix to strmat
+
+	struct d_strmat sD;
+//	d_allocate_strmat(n, n, &sD);
+	d_create_strmat(n, n, &sD, ptr_memory_strmat);
+	ptr_memory_strmat += sD.memory_size;
+
+	struct d_strmat sLU;
+//	d_allocate_strmat(n, n, &sD);
+	d_create_strmat(n, n, &sLU, ptr_memory_strmat);
+	ptr_memory_strmat += sLU.memory_size;
+
+	dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+	printf("\nB+A*A' = \n");
+	d_print_strmat(n, n, &sD, 0, 0);
+
+//	dgetrf_nopivot_libstr(n, n, &sD, 0, 0, &sD, 0, 0);
+	dgetrf_libstr(n, n, &sD, 0, 0, &sLU, 0, 0, ipiv);
+	printf("\nLU = \n");
+	d_print_strmat(n, n, &sLU, 0, 0);
+	printf("\nipiv = \n");
+	int_print_mat(1, n, ipiv, 1);
+
+#if 0 // solve P L U X = P B
+	d_cvt_mat2strmat(n, n, I, n, &sI, 0, 0);
+	printf("\nI = \n");
+	d_print_strmat(n, n, &sI, 0, 0);
+
+	drowpe_libstr(n, ipiv, &sI);
+	printf("\nperm(I) = \n");
+	d_print_strmat(n, n, &sI, 0, 0);
+
+	dtrsm_llnu_libstr(n, n, 1.0, &sLU, 0, 0, &sI, 0, 0, &sD, 0, 0);
+	printf("\nperm(inv(L)) = \n");
+	d_print_strmat(n, n, &sD, 0, 0);
+	dtrsm_lunn_libstr(n, n, 1.0, &sLU, 0, 0, &sD, 0, 0, &sD, 0, 0);
+	printf("\ninv(A) = \n");
+	d_print_strmat(n, n, &sD, 0, 0);
+
+	// convert from strmat to column major matrix
+	d_cvt_strmat2mat(n, n, &sD, 0, 0, D, n);
+#else // solve X^T (P L U)^T = B^T P^T
+	d_cvt_tran_mat2strmat(n, n, I, n, &sI, 0, 0);
+	printf("\nI' = \n");
+	d_print_strmat(n, n, &sI, 0, 0);
+
+	dcolpe_libstr(n, ipiv, &sB);
+	printf("\nperm(I') = \n");
+	d_print_strmat(n, n, &sB, 0, 0);
+
+	dtrsm_rltu_libstr(n, n, 1.0, &sLU, 0, 0, &sB, 0, 0, &sD, 0, 0);
+	printf("\nperm(inv(L')) = \n");
+	d_print_strmat(n, n, &sD, 0, 0);
+	dtrsm_rutn_libstr(n, n, 1.0, &sLU, 0, 0, &sD, 0, 0, &sD, 0, 0);
+	printf("\ninv(A') = \n");
+	d_print_strmat(n, n, &sD, 0, 0);
+
+	// convert from strmat to column major matrix
+	d_cvt_tran_strmat2mat(n, n, &sD, 0, 0, D, n);
+#endif
+
+	// print matrix in column-major format
+	printf("\ninv(A) = \n");
+	d_print_mat(n, n, D, n);
+
+
+
+	//
+	// free memory
+	//
+
+	d_free(A);
+	d_free(B);
+	d_free(D);
+	d_free(I);
+	int_free(ipiv);
+//	d_free_strmat(&sA);
+//	d_free_strmat(&sB);
+//	d_free_strmat(&sD);
+//	d_free_strmat(&sI);
+	v_free_align(memory_strmat);
+
+	return 0;
+	
+	}
diff --git a/examples/example_d_riccati_recursion.c b/examples/example_d_riccati_recursion.c
new file mode 100644
index 0000000..1618ce9
--- /dev/null
+++ b/examples/example_d_riccati_recursion.c
@@ -0,0 +1,595 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "tools.h"
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+
+
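+// backward Riccati recursion, factorize-and-solve (sv) variant: backward factorization sweep followed by forward substitution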
+static void d_back_ric_sv_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strvec *hsux, struct d_strvec *hspi, struct d_strmat *hswork_mat, struct d_strvec *hswork_vec)
+	{
+
+	int nn;
+
+	// factorization and backward substitution
+
+	// last stage
+	dpotrf_l_libstr(nx[N]+1, nx[N], &hsRSQrq[N], 0, 0, &hsL[N], 0, 0);
+
+	// middle stages
+	for(nn=0; nn<N; nn++)
+		{
+		dtrmm_rlnn_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn], nu[N-nn], &hsBAbt[N-nn-1], 0, 0, &hswork_mat[0], 0, 0);
+		dgead_libstr(1, nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn]+nx[N-nn], nu[N-nn], &hswork_mat[0], nu[N-nn-1]+nx[N-nn-1], 0);
+#if 1
+		dsyrk_dpotrf_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#else
+		dsyrk_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+		dpotrf_l_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#endif
+		}
+	
+	// forward substitution
+
+	// first stage
+	nn = 0;
+	drowex_libstr(nu[nn]+nx[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+	dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+	drowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+	dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+	dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+	drowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+	dtrmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+	daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+	dtrmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+
+	// middle stages
+	for(nn=1; nn<N; nn++)
+		{
+		drowex_libstr(nu[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+		dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+		drowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+		dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+		dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+		drowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+		dtrmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+		daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+		dtrmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+		}
+
+	return;
+
+	}
+
+
+
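+// backward Riccati recursion, factorization-only (trf) variant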
+static void d_back_ric_trf_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hswork_mat)
+	{
+
+	int nn;
+
+	// factorization
+
+	// last stage
+	dpotrf_l_libstr(nx[N], nx[N], &hsRSQrq[N], 0, 0, &hsL[N], 0, 0);
+
+	// middle stages
+	for(nn=0; nn<N; nn++)
+		{
+		dtrmm_rlnn_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn], nu[N-nn], &hsBAbt[N-nn-1], 0, 0, &hswork_mat[0], 0, 0);
+#if 1
+		dsyrk_dpotrf_ln_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], nx[N-nn], &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#else
+		dsyrk_ln_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+		dpotrf_l_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#endif
+		}
+	
+	return;
+
+	}
+
+
+
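+/*
+ * Backward/forward solve for a given factorization hsL (a sketch of the routine below):
+ * Pb_k = P_{k+1} * b_k is formed as Lx_{k+1} * (Lx_{k+1}' * b_k) with two dtrmv calls, the
+ * gradients are accumulated backwards through BAbt', and the forward pass recovers ux and the
+ * multipliers pi by triangular solves, reusing the factors computed by _trf or _sv.
+ */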
+static void d_back_ric_trs_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strvec *hsb, struct d_strvec *hsrq, struct d_strmat *hsL, struct d_strvec *hsPb, struct d_strvec *hsux, struct d_strvec *hspi, struct d_strvec *hswork_vec)
+	{
+
+	int nn;
+
+	// backward substitution
+
+	// last stage
+	dveccp_libstr(nu[N]+nx[N], 1.0, &hsrq[N], 0, &hsux[N], 0);
+
+	// middle stages
+	for(nn=0; nn<N-1; nn++)
+		{
+		// compute Pb
+		dtrmv_ltn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+		dtrmv_lnn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+		dveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+		dveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+		daxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+		dgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+		dtrsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+		}
+
+	// first stage
+	nn = N-1;
+	dtrmv_ltn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+	dtrmv_lnn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+	dveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+	dveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+	daxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+	dgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+	dtrsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+
+	// forward substitution
+
+	// first stage
+	nn = 0;
+	dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+	dveccp_libstr(nu[nn]+nx[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+	dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+	dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+	dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+	dtrmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+	dtrmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+	daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+
+	// middle stages
+	for(nn=1; nn<N; nn++)
+		{
+		dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+		dveccp_libstr(nu[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+		dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+		dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+		dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+		dtrmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+		dtrmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+		daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+		}
+
+	return;
+
+	}
+
+
+
+/************************************************ 
+Mass-spring system: nx/2 masses connected to each other by springs (in a row), with the first and the last one also connected to walls. nu (<=nx) controls act on the first nu masses. The system is sampled with sampling time Ts.
+************************************************/
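+/*
+ * With p = nx/2 masses, the continuous-time model assembled below has the block structure
+ * (matching the dmcopy calls)
+ *     Ac = [ 0  I ]        Bc = [ 0    ]
+ *          [ T  0 ]             [ I_nu ]
+ * where T is the tridiagonal coupling matrix with -2 on the diagonal and 1 on the off-diagonals.
+ */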
+static void d_mass_spring_system(double Ts, int nx, int nu, int N, double *A, double *B, double *b, double *x0)
+	{
+
+	int nx2 = nx*nx;
+
+	int info = 0;
+
+	int pp = nx/2; // number of masses
+	
+/************************************************
+* build the continuous time system 
+************************************************/
+	
+	double *T; d_zeros(&T, pp, pp);
+	int ii;
+	for(ii=0; ii<pp; ii++) T[ii*(pp+1)] = -2;
+	for(ii=0; ii<pp-1; ii++) T[ii*(pp+1)+1] = 1;
+	for(ii=1; ii<pp; ii++) T[ii*(pp+1)-1] = 1;
+
+	double *Z; d_zeros(&Z, pp, pp);
+	double *I; d_zeros(&I, pp, pp); for(ii=0; ii<pp; ii++) I[ii*(pp+1)]=1.0; // = eye(pp);
+	double *Ac; d_zeros(&Ac, nx, nx);
+	dmcopy(pp, pp, Z, pp, Ac, nx);
+	dmcopy(pp, pp, T, pp, Ac+pp, nx);
+	dmcopy(pp, pp, I, pp, Ac+pp*nx, nx);
+	dmcopy(pp, pp, Z, pp, Ac+pp*(nx+1), nx); 
+	free(T);
+	free(Z);
+	free(I);
+	
+	d_zeros(&I, nu, nu); for(ii=0; ii<nu; ii++) I[ii*(nu+1)]=1.0; //I = eye(nu);
+	double *Bc; d_zeros(&Bc, nx, nu);
+	dmcopy(nu, nu, I, nu, Bc+pp, nx);
+	free(I);
+	
+/************************************************
+* compute the discrete time system 
+************************************************/
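+	// exact zero-order-hold discretization, as coded below: A = expm(Ac*Ts) and
+	// B = Ac^{-1} * (A - I) * Bc (the product is formed with dgemm and the solve with dgesv)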
+
+	double *bb; d_zeros(&bb, nx, 1);
+	dmcopy(nx, 1, bb, nx, b, nx);
+		
+	dmcopy(nx, nx, Ac, nx, A, nx);
+	dscal_3l(nx2, Ts, A);
+	expm(nx, A);
+	
+	d_zeros(&T, nx, nx);
+	d_zeros(&I, nx, nx); for(ii=0; ii<nx; ii++) I[ii*(nx+1)]=1.0; //I = eye(nx);
+	dmcopy(nx, nx, A, nx, T, nx);
+	daxpy_3l(nx2, -1.0, I, T);
+	dgemm_nn_3l(nx, nu, nx, T, nx, Bc, nx, B, nx);
+	free(T);
+	free(I);
+	
+	int *ipiv = (int *) malloc(nx*sizeof(int));
+	dgesv_3l(nx, nu, Ac, nx, ipiv, B, nx, &info);
+	free(ipiv);
+
+	free(Ac);
+	free(Bc);
+	free(bb);
+	
+			
+/************************************************
+* initial state 
+************************************************/
+	
+	if(nx==4)
+		{
+		x0[0] = 5;
+		x0[1] = 10;
+		x0[2] = 15;
+		x0[3] = 20;
+		}
+	else
+		{
+		int jj;
+		for(jj=0; jj<nx; jj++)
+			x0[jj] = 1;
+		}
+
+	}
+
+
+
+int main()
+	{
+
+	printf("\nExample of Riccati factorization and backsolve\n\n");
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+	printf("\nLA provided by BLASFEO\n\n");
+
+#elif defined(LA_BLAS)
+
+	printf("\nLA provided by BLAS\n\n");
+
+#elif defined(LA_REFERENCE)
+
+	printf("\nLA provided by REFERENCE\n\n");
+
+#else
+
+	printf("\nLA provided by ???\n\n");
+	exit(2);
+
+#endif
+
+	// loop index
+	int ii;
+
+/************************************************
+* problem size
+************************************************/	
+
+	// problem size
+	int N = 4;
+	int nx_ = 4;
+	int nu_ = 1;
+
+	// stage-wise variable sizes
+	int nx[N+1];
+	nx[0] = 0;
+	for(ii=1; ii<=N; ii++)
+		nx[ii] = nx_;
+	nx[N] = nx_;
+
+	int nu[N+1];
+	for(ii=0; ii<N; ii++)
+		nu[ii] = nu_;
+	nu[N] = 0;
+
+/************************************************
+* dynamical system
+************************************************/	
+
+	double *A; d_zeros(&A, nx_, nx_); // states update matrix
+
+	double *B; d_zeros(&B, nx_, nu_); // inputs matrix
+
+	double *b; d_zeros(&b, nx_, 1); // states offset
+	double *x0; d_zeros(&x0, nx_, 1); // initial state
+
+	double Ts = 0.5; // sampling time
+	d_mass_spring_system(Ts, nx_, nu_, N, A, B, b, x0);
+	
+	for(ii=0; ii<nx_; ii++)
+		b[ii] = 0.1;
+	
+	for(ii=0; ii<nx_; ii++)
+		x0[ii] = 0;
+	x0[0] = 2.5;
+	x0[1] = 2.5;
+
+	d_print_mat(nx_, nx_, A, nx_);
+	d_print_mat(nx_, nu_, B, nx_);
+	d_print_mat(1, nx_, b, 1);
+	d_print_mat(1, nx_, x0, 1);
+
+/************************************************
+* cost function
+************************************************/	
+
+	double *R; d_zeros(&R, nu_, nu_);
+	for(ii=0; ii<nu_; ii++) R[ii*(nu_+1)] = 2.0;
+
+	double *S; d_zeros(&S, nu_, nx_);
+
+	double *Q; d_zeros(&Q, nx_, nx_);
+	for(ii=0; ii<nx_; ii++) Q[ii*(nx_+1)] = 1.0;
+
+	double *r; d_zeros(&r, nu_, 1);
+	for(ii=0; ii<nu_; ii++) r[ii] = 0.2;
+
+	double *q; d_zeros(&q, nx_, 1);
+	for(ii=0; ii<nx_; ii++) q[ii] = 0.1;
+
+	d_print_mat(nu_, nu_, R, nu_);
+	d_print_mat(nu_, nx_, S, nu_);
+	d_print_mat(nx_, nx_, Q, nx_);
+	d_print_mat(1, nu_, r, 1);
+	d_print_mat(1, nx_, q, 1);
+
+/************************************************
+* matrices as strmat
+************************************************/	
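+	// layout used by the Riccati routines (as set up by the conversions below): BAbt stacks
+	// [B A]' with b' appended as an extra bottom row, and RSQrq stacks the lower part of the
+	// Hessian [R; S' Q] with [r' q'] as an extra bottom row; at the first stage x0 is
+	// eliminated, i.e. b0 = b + A*x0 and only [B' ; b0'] is stored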
+
+	struct d_strmat sA;
+	d_allocate_strmat(nx_, nx_, &sA);
+	d_cvt_mat2strmat(nx_, nx_, A, nx_, &sA, 0, 0);
+	struct d_strvec sb;
+	d_allocate_strvec(nx_, &sb);
+	d_cvt_vec2strvec(nx_, b, &sb, 0);
+	struct d_strvec sx0;
+	d_allocate_strvec(nx_, &sx0);
+	d_cvt_vec2strvec(nx_, x0, &sx0, 0);
+	struct d_strvec sb0;
+	d_allocate_strvec(nx_, &sb0);
+	double *b0; d_zeros(&b0, nx_, 1); // states offset
+	dgemv_n_libstr(nx_, nx_, 1.0, &sA, 0, 0, &sx0, 0, 1.0, &sb, 0, &sb0, 0);
+	d_print_tran_strvec(nx_, &sb0, 0);
+
+	struct d_strmat sBbt0;
+	d_allocate_strmat(nu_+nx_+1, nx_, &sBbt0);
+	d_cvt_tran_mat2strmat(nx_, nu_, B, nx_, &sBbt0, 0, 0);
+	drowin_libstr(nx_, 1.0, &sb0, 0, &sBbt0, nu_, 0);
+	d_print_strmat(nu_+1, nx_, &sBbt0, 0, 0);
+
+	struct d_strmat sBAbt1;
+	d_allocate_strmat(nu_+nx_+1, nx_, &sBAbt1);
+	d_cvt_tran_mat2strmat(nx_, nu_, B, nx_, &sBAbt1, 0, 0);
+	d_cvt_tran_mat2strmat(nx_, nx_, A, nx_, &sBAbt1, nu_, 0);
+	d_cvt_tran_mat2strmat(nx_, 1, b, nx_, &sBAbt1, nu_+nx_, 0);
+	d_print_strmat(nu_+nx_+1, nx_, &sBAbt1, 0, 0);
+
+	struct d_strvec sr0; // XXX no need to update r0 since S=0
+	d_allocate_strvec(nu_, &sr0);
+	d_cvt_vec2strvec(nu_, r, &sr0, 0);
+
+	struct d_strmat sRr0;
+	d_allocate_strmat(nu_+1, nu_, &sRr0);
+	d_cvt_mat2strmat(nu_, nu_, R, nu_, &sRr0, 0, 0);
+	drowin_libstr(nu_, 1.0, &sr0, 0, &sRr0, nu_, 0);
+	d_print_strmat(nu_+1, nu_, &sRr0, 0, 0);
+
+	struct d_strvec srq1;
+	d_allocate_strvec(nu_+nx_, &srq1);
+	d_cvt_vec2strvec(nu_, r, &srq1, 0);
+	d_cvt_vec2strvec(nx_, q, &srq1, nu_);
+
+	struct d_strmat sRSQrq1;
+	d_allocate_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1);
+	d_cvt_mat2strmat(nu_, nu_, R, nu_, &sRSQrq1, 0, 0);
+	d_cvt_tran_mat2strmat(nu_, nx_, S, nu_, &sRSQrq1, nu_, 0);
+	d_cvt_mat2strmat(nx_, nx_, Q, nx_, &sRSQrq1, nu_, nu_);
+	drowin_libstr(nu_+nx_, 1.0, &srq1, 0, &sRSQrq1, nu_+nx_, 0);
+	d_print_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1, 0, 0);
+
+	struct d_strvec sqN;
+	d_allocate_strvec(nx_, &sqN);
+	d_cvt_vec2strvec(nx_, q, &sqN, 0);
+
+	struct d_strmat sQqN;
+	d_allocate_strmat(nx_+1, nx_, &sQqN);
+	d_cvt_mat2strmat(nx_, nx_, Q, nx_, &sQqN, 0, 0);
+	drowin_libstr(nx_, 1.0, &sqN, 0, &sQqN, nx_, 0);
+	d_print_strmat(nx_+1, nx_, &sQqN, 0, 0);
+
+/************************************************
+* array of matrices
+************************************************/	
+	
+	struct d_strmat hsBAbt[N];
+	struct d_strvec hsb[N];
+	struct d_strmat hsRSQrq[N+1];
+	struct d_strvec hsrq[N+1];
+	struct d_strmat hsL[N+1];
+	struct d_strvec hsPb[N];
+	struct d_strvec hsux[N+1];
+	struct d_strvec hspi[N];
+	struct d_strmat hswork_mat[1];
+	struct d_strvec hswork_vec[1];
+
+	hsBAbt[0] = sBbt0;
+	hsb[0] = sb0;
+	hsRSQrq[0] = sRr0;
+	hsrq[0] = sr0;
+	d_allocate_strmat(nu_+1, nu_, &hsL[0]);
+	d_allocate_strvec(nx_, &hsPb[0]);
+	d_allocate_strvec(nx_+nu_+1, &hsux[0]);
+	d_allocate_strvec(nx_, &hspi[0]);
+	for(ii=1; ii<N; ii++)
+		{
+		hsBAbt[ii] = sBAbt1;
+		hsb[ii] = sb;
+		hsRSQrq[ii] = sRSQrq1;
+		hsrq[ii] = srq1;
+		d_allocate_strmat(nu_+nx_+1, nu_+nx_, &hsL[ii]);
+		d_allocate_strvec(nx_, &hsPb[ii]);
+		d_allocate_strvec(nx_+nu_+1, &hsux[ii]);
+		d_allocate_strvec(nx_, &hspi[ii]);
+		}
+	hsRSQrq[N] = sQqN;
+	hsrq[N] = sqN;
+	d_allocate_strmat(nx_+1, nx_, &hsL[N]);
+	d_allocate_strvec(nx_+nu_+1, &hsux[N]);
+	d_allocate_strmat(nu_+nx_+1, nx_, &hswork_mat[0]);
+	d_allocate_strvec(nx_, &hswork_vec[0]);
+
+//	for(ii=0; ii<N; ii++)
+//		d_print_strmat(nu[ii]+nx[ii]+1, nx[ii+1], &hsBAbt[ii], 0, 0);
+//	return 0;
+
+/************************************************
+* call Riccati solver
+************************************************/	
+	
+	// timing 
+	struct timeval tv0, tv1, tv2, tv3;
+	int nrep = 1000;
+	int rep;
+
+	gettimeofday(&tv0, NULL); // time
+
+	for(rep=0; rep<nrep; rep++)
+		{
+		d_back_ric_sv_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hsux, hspi, hswork_mat, hswork_vec);
+		}
+
+	gettimeofday(&tv1, NULL); // time
+
+	for(rep=0; rep<nrep; rep++)
+		{
+		d_back_ric_trf_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hswork_mat);
+		}
+
+	gettimeofday(&tv2, NULL); // time
+
+	for(rep=0; rep<nrep; rep++)
+		{
+		d_back_ric_trs_libstr(N, nx, nu, hsBAbt, hsb, hsrq, hsL, hsPb, hsux, hspi, hswork_vec);
+		}
+
+	gettimeofday(&tv3, NULL); // time
+
+	float time_sv  = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+	float time_trf = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+	float time_trs = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
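+	// average wall-clock time per call, in seconds, over the nrep repetitions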
+
+	// print sol
+	printf("\nux = \n\n");
+	for(ii=0; ii<=N; ii++)
+		d_print_tran_strvec(nu[ii]+nx[ii], &hsux[ii], 0);
+
+	printf("\npi = \n\n");
+	for(ii=0; ii<N; ii++)
+		d_print_tran_strvec(nx[ii+1], &hspi[ii], 0);
+
+//	printf("\nL = \n\n");
+//	for(ii=0; ii<=N; ii++)
+//		d_print_strmat(nu[ii]+nx[ii]+1, nu[ii]+nx[ii], &hsL[ii], 0, 0);
+
+	printf("\ntime sv\t\ttime trf\t\ttime trs\n");
+	printf("\n%e\t%e\t%e\n", time_sv, time_trf, time_trs);
+	printf("\n");
+
+/************************************************
+* free memory
+************************************************/	
+
+	d_free(A);
+	d_free(B);
+	d_free(b);
+	d_free(x0);
+	d_free(R);
+	d_free(S);
+	d_free(Q);
+	d_free(r);
+	d_free(q);
+	d_free(b0);
+	d_free_strmat(&sA);
+	d_free_strvec(&sb);
+	d_free_strmat(&sBbt0);
+	d_free_strvec(&sb0);
+	d_free_strmat(&sBAbt1);
+	d_free_strmat(&sRr0);
+	d_free_strvec(&sr0);
+	d_free_strmat(&sRSQrq1);
+	d_free_strvec(&srq1);
+	d_free_strmat(&sQqN);
+	d_free_strvec(&sqN);
+	d_free_strmat(&hsL[0]);
+	d_free_strvec(&hsPb[0]);
+	d_free_strvec(&hsux[0]);
+	d_free_strvec(&hspi[0]);
+	for(ii=1; ii<N; ii++)
+		{
+		d_free_strmat(&hsL[ii]);
+		d_free_strvec(&hsPb[ii]);
+		d_free_strvec(&hsux[ii]);
+		d_free_strvec(&hspi[ii]);
+		}
+	d_free_strmat(&hsL[N]);
+	d_free_strvec(&hsux[N]);
+	d_free_strmat(&hswork_mat[0]);
+	d_free_strvec(&hswork_vec[0]);
+
+
+/************************************************
+* return
+************************************************/	
+
+	return 0;
+
+	}
+
+
+
diff --git a/examples/example_s_lu_factorization.c b/examples/example_s_lu_factorization.c
new file mode 100644
index 0000000..e298604
--- /dev/null
+++ b/examples/example_s_lu_factorization.c
@@ -0,0 +1,211 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_blas.h"
+
+
+int main()
+	{
+
+	printf("\nExample of LU factorization and backsolve\n\n");
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+	printf("\nLA provided by BLASFEO\n\n");
+
+#elif defined(LA_REFERENCE)
+
+	printf("\nLA provided by REFERENCE\n\n");
+
+#elif defined(LA_BLAS)
+
+	printf("\nLA provided by BLAS\n\n");
+
+#else
+
+	printf("\nLA provided by ???\n\n");
+	exit(2);
+
+#endif
+
+	int ii;
+
+	int n = 16;
+
+	//
+	// matrices in column-major format
+	//
+
+	float *A; s_zeros(&A, n, n);
+	for(ii=0; ii<n*n; ii++) A[ii] = ii;
+//	s_print_mat(n, n, A, n);
+
+	// spd matrix
+	float *B; s_zeros(&B, n, n);
+	for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+//	s_print_mat(n, n, B, n);
+
+	// identity
+	float *I; s_zeros(&I, n, n);
+	for(ii=0; ii<n; ii++) I[ii*(n+1)] = 1.0;
+//	s_print_mat(n, n, B, n);
+
+	// result matrix
+	float *D; s_zeros(&D, n, n);
+//	s_print_mat(n, n, D, n);
+
+	// permutation indices
+	int *ipiv; int_zeros(&ipiv, n, 1);
+
+	//
+	// matrices in matrix struct format
+	//
+
+	// work space large enough for 5 matrix structs of size n times n
+	int size_strmat = 5*s_size_strmat(n, n);
+	void *memory_strmat; v_zeros_align(&memory_strmat, size_strmat);
+	char *ptr_memory_strmat = (char *) memory_strmat;
+
+	struct s_strmat sA;
+//	s_allocate_strmat(n, n, &sA);
+	s_create_strmat(n, n, &sA, ptr_memory_strmat);
+	ptr_memory_strmat += sA.memory_size;
+	// convert from column major matrix to strmat
+	s_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+	printf("\nA = \n");
+	s_print_strmat(n, n, &sA, 0, 0);
+
+	struct s_strmat sB;
+//	s_allocate_strmat(n, n, &sB);
+	s_create_strmat(n, n, &sB, ptr_memory_strmat);
+	ptr_memory_strmat += sB.memory_size;
+	// convert from column major matrix to strmat
+	s_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+	printf("\nB = \n");
+	s_print_strmat(n, n, &sB, 0, 0);
+
+	struct s_strmat sI;
+//	s_allocate_strmat(n, n, &sI);
+	s_create_strmat(n, n, &sI, ptr_memory_strmat);
+	ptr_memory_strmat += sI.memory_size;
+	// the conversion of I into sI is done later, inside the selected solve branch below
+
+	struct s_strmat sD;
+//	s_allocate_strmat(n, n, &sD);
+	s_create_strmat(n, n, &sD, ptr_memory_strmat);
+	ptr_memory_strmat += sD.memory_size;
+
+	struct s_strmat sLU;
+//	s_allocate_strmat(n, n, &sD);
+	s_create_strmat(n, n, &sLU, ptr_memory_strmat);
+	ptr_memory_strmat += sLU.memory_size;
+
+	sgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+	printf("\nB+A*A' = \n");
+	s_print_strmat(n, n, &sD, 0, 0);
+
+//	sgetrf_nopivot_libstr(n, n, &sD, 0, 0, &sD, 0, 0);
+	sgetrf_libstr(n, n, &sD, 0, 0, &sLU, 0, 0, ipiv);
+	printf("\nLU = \n");
+	s_print_strmat(n, n, &sLU, 0, 0);
+	printf("\nipiv = \n");
+	int_print_mat(1, n, ipiv, 1);
+
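+	// both branches below recover the inverse of the factorized matrix D = B + A*A': the
+	// disabled one uses left-side triangular solves, the active one works on the transposed
+	// system (exercising the right-side strsm kernels) and transposes the result back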
+#if 0 // solve P L U X = P B
+	s_cvt_mat2strmat(n, n, I, n, &sI, 0, 0);
+	printf("\nI = \n");
+	s_print_strmat(n, n, &sI, 0, 0);
+
+	srowpe_libstr(n, ipiv, &sI);
+	printf("\nperm(I) = \n");
+	s_print_strmat(n, n, &sI, 0, 0);
+
+	strsm_llnu_libstr(n, n, 1.0, &sLU, 0, 0, &sI, 0, 0, &sD, 0, 0);
+	printf("\nperm(inv(L)) = \n");
+	s_print_strmat(n, n, &sD, 0, 0);
+	strsm_lunn_libstr(n, n, 1.0, &sLU, 0, 0, &sD, 0, 0, &sD, 0, 0);
+	printf("\ninv(A) = \n");
+	s_print_strmat(n, n, &sD, 0, 0);
+
+	// convert from strmat to column major matrix
+	s_cvt_strmat2mat(n, n, &sD, 0, 0, D, n);
+#else // solve X^T (P L U)^T = B^T P^T
+	s_cvt_tran_mat2strmat(n, n, I, n, &sI, 0, 0);
+	printf("\nI' = \n");
+	s_print_strmat(n, n, &sI, 0, 0);
+
+	scolpe_libstr(n, ipiv, &sB);
+	printf("\nperm(I') = \n");
+	s_print_strmat(n, n, &sB, 0, 0);
+
+	strsm_rltu_libstr(n, n, 1.0, &sLU, 0, 0, &sB, 0, 0, &sD, 0, 0);
+	printf("\nperm(inv(L')) = \n");
+	s_print_strmat(n, n, &sD, 0, 0);
+	strsm_rutn_libstr(n, n, 1.0, &sLU, 0, 0, &sD, 0, 0, &sD, 0, 0);
+	printf("\ninv(A') = \n");
+	s_print_strmat(n, n, &sD, 0, 0);
+
+	// convert from strmat to column major matrix
+	s_cvt_tran_strmat2mat(n, n, &sD, 0, 0, D, n);
+#endif
+
+	// print matrix in column-major format
+	printf("\ninv(A) = \n");
+	s_print_mat(n, n, D, n);
+
+
+
+	//
+	// free memory
+	//
+
+	s_free(A);
+	s_free(B);
+	s_free(D);
+	s_free(I);
+	int_free(ipiv);
+//	s_free_strmat(&sA);
+//	s_free_strmat(&sB);
+//	s_free_strmat(&sD);
+//	s_free_strmat(&sI);
+	v_free_align(memory_strmat);
+
+	return 0;
+	
+	}
+
diff --git a/examples/example_s_riccati_recursion.c b/examples/example_s_riccati_recursion.c
new file mode 100644
index 0000000..03b9fc6
--- /dev/null
+++ b/examples/example_s_riccati_recursion.c
@@ -0,0 +1,605 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "tools.h"
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_blas.h"
+
+
+
+static void s_back_ric_sv_libstr(int N, int *nx, int *nu, struct s_strmat *hsBAbt, struct s_strmat *hsRSQrq, struct s_strmat *hsL, struct s_strvec *hsux, struct s_strvec *hspi, struct s_strmat *hswork_mat, struct s_strvec *hswork_vec)
+	{
+
+	int nn;
+
+	// factorization and backward substitution
+
+	// last stage
+	spotrf_l_libstr(nx[N]+1, nx[N], &hsRSQrq[N], 0, 0, &hsL[N], 0, 0);
+
+	// middle stages
+	for(nn=0; nn<N; nn++)
+		{
+		strmm_rlnn_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn], nu[N-nn], &hsBAbt[N-nn-1], 0, 0, &hswork_mat[0], 0, 0);
+		sgead_libstr(1, nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn]+nx[N-nn], nu[N-nn], &hswork_mat[0], nu[N-nn-1]+nx[N-nn-1], 0);
+#if 1
+		ssyrk_spotrf_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#else
+		ssyrk_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+		spotrf_l_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#endif
+		}
+	
+	// forward substitution
+
+	// first stage
+	nn = 0;
+	srowex_libstr(nu[nn]+nx[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+	strsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+	srowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+	sgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+	sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+	srowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+	strmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+	saxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+	strmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+
+	// middle stages
+	for(nn=1; nn<N; nn++)
+		{
+		srowex_libstr(nu[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+		strsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+		srowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+		sgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+		sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+		srowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+		strmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+		saxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+		strmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hspi[nn], 0, &hspi[nn], 0);
+		}
+
+	return;
+
+	}
+
+
+
+static void s_back_ric_trf_libstr(int N, int *nx, int *nu, struct s_strmat *hsBAbt, struct s_strmat *hsRSQrq, struct s_strmat *hsL, struct s_strmat *hswork_mat)
+	{
+
+	int nn;
+
+	// factorization
+
+	// last stage
+	spotrf_l_libstr(nx[N], nx[N], &hsRSQrq[N], 0, 0, &hsL[N], 0, 0);
+
+	// middle stages
+	for(nn=0; nn<N; nn++)
+		{
+		strmm_rlnn_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn], nu[N-nn], &hsBAbt[N-nn-1], 0, 0, &hswork_mat[0], 0, 0);
+#if 1
+		ssyrk_spotrf_ln_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], nx[N-nn], &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#else
+		ssyrk_ln_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+		spotrf_l_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#endif
+		}
+	
+	return;
+
+	}
+
+
+
+static void s_back_ric_trs_libstr(int N, int *nx, int *nu, struct s_strmat *hsBAbt, struct s_strvec *hsb, struct s_strvec *hsrq, struct s_strmat *hsL, struct s_strvec *hsPb, struct s_strvec *hsux, struct s_strvec *hspi, struct s_strvec *hswork_vec)
+	{
+
+	int nn;
+
+	// backward substitution
+
+	// last stage
+	sveccp_libstr(nu[N]+nx[N], 1.0, &hsrq[N], 0, &hsux[N], 0);
+
+	// middle stages
+	for(nn=0; nn<N-1; nn++)
+		{
+		// compute Pb
+		strmv_ltn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+		strmv_lnn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+		sveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+		sveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+		saxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+		sgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+		strsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+		}
+
+	// first stage
+	nn = N-1;
+	strmv_ltn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+	strmv_lnn_libstr(nx[N-nn], nx[N-nn], &hsL[N-nn], nu[N-nn], nu[N-nn], &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+	sveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+	sveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+	saxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+	sgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+	strsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+
+	// forward substitution
+
+	// first stage
+	nn = 0;
+	sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+	sveccp_libstr(nu[nn]+nx[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+	strsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+	sgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+	sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+	strmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+	strmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+	saxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+
+	// middle stages
+	for(nn=1; nn<N; nn++)
+		{
+		sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+		sveccp_libstr(nu[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+		strsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+		sgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+		sveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+		strmv_ltn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+		strmv_lnn_libstr(nx[nn+1], nx[nn+1], &hsL[nn+1], nu[nn+1], nu[nn+1], &hswork_vec[0], 0, &hswork_vec[0], 0);
+		saxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+		}
+
+	return;
+
+	}
+
+
+
+/************************************************ 
+Mass-spring system: nx/2 masses connected to each other by springs (in a row), with the first and the last one also connected to walls. nu (<=nx) controls act on the first nu masses. The system is sampled with sampling time Ts.
+************************************************/
+static void d_mass_spring_system(double Ts, int nx, int nu, int N, double *A, double *B, double *b, double *x0)
+	{
+
+	int nx2 = nx*nx;
+
+	int info = 0;
+
+	int pp = nx/2; // number of masses
+	
+/************************************************
+* build the continuous time system 
+************************************************/
+	
+	double *T; d_zeros(&T, pp, pp);
+	int ii;
+	for(ii=0; ii<pp; ii++) T[ii*(pp+1)] = -2;
+	for(ii=0; ii<pp-1; ii++) T[ii*(pp+1)+1] = 1;
+	for(ii=1; ii<pp; ii++) T[ii*(pp+1)-1] = 1;
+
+	double *Z; d_zeros(&Z, pp, pp);
+	double *I; d_zeros(&I, pp, pp); for(ii=0; ii<pp; ii++) I[ii*(pp+1)]=1.0; // = eye(pp);
+	double *Ac; d_zeros(&Ac, nx, nx);
+	dmcopy(pp, pp, Z, pp, Ac, nx);
+	dmcopy(pp, pp, T, pp, Ac+pp, nx);
+	dmcopy(pp, pp, I, pp, Ac+pp*nx, nx);
+	dmcopy(pp, pp, Z, pp, Ac+pp*(nx+1), nx); 
+	free(T);
+	free(Z);
+	free(I);
+	
+	d_zeros(&I, nu, nu); for(ii=0; ii<nu; ii++) I[ii*(nu+1)]=1.0; //I = eye(nu);
+	double *Bc; d_zeros(&Bc, nx, nu);
+	dmcopy(nu, nu, I, nu, Bc+pp, nx);
+	free(I);
+	
+/************************************************
+* compute the discrete time system 
+************************************************/
+
+	double *bb; d_zeros(&bb, nx, 1);
+	dmcopy(nx, 1, bb, nx, b, nx);
+		
+	dmcopy(nx, nx, Ac, nx, A, nx);
+	dscal_3l(nx2, Ts, A);
+	expm(nx, A);
+	
+	d_zeros(&T, nx, nx);
+	d_zeros(&I, nx, nx); for(ii=0; ii<nx; ii++) I[ii*(nx+1)]=1.0; //I = eye(nx);
+	dmcopy(nx, nx, A, nx, T, nx);
+	daxpy_3l(nx2, -1.0, I, T);
+	dgemm_nn_3l(nx, nu, nx, T, nx, Bc, nx, B, nx);
+	free(T);
+	free(I);
+	
+	int *ipiv = (int *) malloc(nx*sizeof(int));
+	dgesv_3l(nx, nu, Ac, nx, ipiv, B, nx, &info);
+	free(ipiv);
+
+	free(Ac);
+	free(Bc);
+	free(bb);
+	
+			
+/************************************************
+* initial state 
+************************************************/
+	
+	if(nx==4)
+		{
+		x0[0] = 5;
+		x0[1] = 10;
+		x0[2] = 15;
+		x0[3] = 20;
+		}
+	else
+		{
+		int jj;
+		for(jj=0; jj<nx; jj++)
+			x0[jj] = 1;
+		}
+
+	}
+
+
+
+int main()
+	{
+
+	printf("\nExample of Riccati factorization and backsolve\n\n");
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+	printf("\nLA provided by BLASFEO\n\n");
+
+#elif defined(LA_BLAS)
+
+	printf("\nLA provided by BLAS\n\n");
+
+#elif defined(LA_REFERENCE)
+
+	printf("\nLA provided by REFERENCE\n\n");
+
+#else
+
+	printf("\nLA provided by ???\n\n");
+	exit(2);
+
+#endif
+
+	// loop index
+	int ii;
+
+/************************************************
+* problem size
+************************************************/	
+
+	// problem size
+	int N = 4;
+	int nx_ = 4;
+	int nu_ = 1;
+
+	// stage-wise variable sizes
+	int nx[N+1];
+	nx[0] = 0;
+	for(ii=1; ii<=N; ii++)
+		nx[ii] = nx_;
+	nx[N] = nx_;
+
+	int nu[N+1];
+	for(ii=0; ii<N; ii++)
+		nu[ii] = nu_;
+	nu[N] = 0;
+
+/************************************************
+* dynamical system
+************************************************/	
+
+	double *Ad; d_zeros(&Ad, nx_, nx_); // states update matrix
+
+	double *Bd; d_zeros(&Bd, nx_, nu_); // inputs matrix
+
+	double *bd; d_zeros(&bd, nx_, 1); // states offset
+	double *x0d; d_zeros(&x0d, nx_, 1); // initial state
+
+	double Ts = 0.5; // sampling time
+	d_mass_spring_system(Ts, nx_, nu_, N, Ad, Bd, bd, x0d);
+
+	float *A; s_zeros(&A, nx_, nx_); for(ii=0; ii<nx_*nx_; ii++) A[ii] = (float) Ad[ii];
+	float *B; s_zeros(&B, nx_, nu_); for(ii=0; ii<nx_*nu_; ii++) B[ii] = (float) Bd[ii];
+	float *b; s_zeros(&b, nx_, 1); for(ii=0; ii<nx_; ii++) b[ii] = (float) bd[ii];
+	float *x0; s_zeros(&x0, nx_, 1); for(ii=0; ii<nx_; ii++) x0[ii] = (float) x0d[ii];
+	
+	for(ii=0; ii<nx_; ii++)
+		b[ii] = 0.1;
+	
+	for(ii=0; ii<nx_; ii++)
+		x0[ii] = 0;
+	x0[0] = 2.5;
+	x0[1] = 2.5;
+
+	s_print_mat(nx_, nx_, A, nx_);
+	s_print_mat(nx_, nu_, B, nx_);
+	s_print_mat(1, nx_, b, 1);
+	s_print_mat(1, nx_, x0, 1);
+
+/************************************************
+* cost function
+************************************************/	
+
+	float *R; s_zeros(&R, nu_, nu_);
+	for(ii=0; ii<nu_; ii++) R[ii*(nu_+1)] = 2.0;
+
+	float *S; s_zeros(&S, nu_, nx_);
+
+	float *Q; s_zeros(&Q, nx_, nx_);
+	for(ii=0; ii<nx_; ii++) Q[ii*(nx_+1)] = 1.0;
+
+	float *r; s_zeros(&r, nu_, 1);
+	for(ii=0; ii<nu_; ii++) r[ii] = 0.2;
+
+	float *q; s_zeros(&q, nx_, 1);
+	for(ii=0; ii<nx_; ii++) q[ii] = 0.1;
+
+	s_print_mat(nu_, nu_, R, nu_);
+	s_print_mat(nu_, nx_, S, nu_);
+	s_print_mat(nx_, nx_, Q, nx_);
+	s_print_mat(1, nu_, r, 1);
+	s_print_mat(1, nx_, q, 1);
+
+/************************************************
+* matrices as strmat
+************************************************/	
+
+	struct s_strmat sA;
+	s_allocate_strmat(nx_, nx_, &sA);
+	s_cvt_mat2strmat(nx_, nx_, A, nx_, &sA, 0, 0);
+	struct s_strvec sb;
+	s_allocate_strvec(nx_, &sb);
+	s_cvt_vec2strvec(nx_, b, &sb, 0);
+	struct s_strvec sx0;
+	s_allocate_strvec(nx_, &sx0);
+	s_cvt_vec2strvec(nx_, x0, &sx0, 0);
+	struct s_strvec sb0;
+	s_allocate_strvec(nx_, &sb0);
+	float *b0; s_zeros(&b0, nx_, 1); // states offset
+	sgemv_n_libstr(nx_, nx_, 1.0, &sA, 0, 0, &sx0, 0, 1.0, &sb, 0, &sb0, 0);
+	s_print_tran_strvec(nx_, &sb0, 0);
+
+	struct s_strmat sBbt0;
+	s_allocate_strmat(nu_+nx_+1, nx_, &sBbt0);
+	s_cvt_tran_mat2strmat(nx_, nu_, B, nx_, &sBbt0, 0, 0);
+	srowin_libstr(nx_, 1.0, &sb0, 0, &sBbt0, nu_, 0);
+	s_print_strmat(nu_+1, nx_, &sBbt0, 0, 0);
+
+	struct s_strmat sBAbt1;
+	s_allocate_strmat(nu_+nx_+1, nx_, &sBAbt1);
+	s_cvt_tran_mat2strmat(nx_, nu_, B, nx_, &sBAbt1, 0, 0);
+	s_cvt_tran_mat2strmat(nx_, nx_, A, nx_, &sBAbt1, nu_, 0);
+	s_cvt_tran_mat2strmat(nx_, 1, b, nx_, &sBAbt1, nu_+nx_, 0);
+	s_print_strmat(nu_+nx_+1, nx_, &sBAbt1, 0, 0);
+
+	struct s_strvec sr0; // XXX no need to update r0 since S=0
+	s_allocate_strvec(nu_, &sr0);
+	s_cvt_vec2strvec(nu_, r, &sr0, 0);
+
+	struct s_strmat sRr0;
+	s_allocate_strmat(nu_+1, nu_, &sRr0);
+	s_cvt_mat2strmat(nu_, nu_, R, nu_, &sRr0, 0, 0);
+	srowin_libstr(nu_, 1.0, &sr0, 0, &sRr0, nu_, 0);
+	s_print_strmat(nu_+1, nu_, &sRr0, 0, 0);
+
+	struct s_strvec srq1;
+	s_allocate_strvec(nu_+nx_, &srq1);
+	s_cvt_vec2strvec(nu_, r, &srq1, 0);
+	s_cvt_vec2strvec(nx_, q, &srq1, nu_);
+
+	struct s_strmat sRSQrq1;
+	s_allocate_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1);
+	s_cvt_mat2strmat(nu_, nu_, R, nu_, &sRSQrq1, 0, 0);
+	s_cvt_tran_mat2strmat(nu_, nx_, S, nu_, &sRSQrq1, nu_, 0);
+	s_cvt_mat2strmat(nx_, nx_, Q, nx_, &sRSQrq1, nu_, nu_);
+	srowin_libstr(nu_+nx_, 1.0, &srq1, 0, &sRSQrq1, nu_+nx_, 0);
+	s_print_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1, 0, 0);
+
+	struct s_strvec sqN;
+	s_allocate_strvec(nx_, &sqN);
+	s_cvt_vec2strvec(nx_, q, &sqN, 0);
+
+	struct s_strmat sQqN;
+	s_allocate_strmat(nx_+1, nx_, &sQqN);
+	s_cvt_mat2strmat(nx_, nx_, Q, nx_, &sQqN, 0, 0);
+	srowin_libstr(nx_, 1.0, &sqN, 0, &sQqN, nx_, 0);
+	s_print_strmat(nx_+1, nx_, &sQqN, 0, 0);
+
+/************************************************
+* array of matrices
+************************************************/	
+	
+	struct s_strmat hsBAbt[N];
+	struct s_strvec hsb[N];
+	struct s_strmat hsRSQrq[N+1];
+	struct s_strvec hsrq[N+1];
+	struct s_strmat hsL[N+1];
+	struct s_strvec hsPb[N];
+	struct s_strvec hsux[N+1];
+	struct s_strvec hspi[N];
+	struct s_strmat hswork_mat[1];
+	struct s_strvec hswork_vec[1];
+
+	hsBAbt[0] = sBbt0;
+	hsb[0] = sb0;
+	hsRSQrq[0] = sRr0;
+	hsrq[0] = sr0;
+	s_allocate_strmat(nu_+1, nu_, &hsL[0]);
+	s_allocate_strvec(nx_, &hsPb[0]);
+	s_allocate_strvec(nx_+nu_+1, &hsux[0]);
+	s_allocate_strvec(nx_, &hspi[0]);
+	for(ii=1; ii<N; ii++)
+		{
+		hsBAbt[ii] = sBAbt1;
+		hsb[ii] = sb;
+		hsRSQrq[ii] = sRSQrq1;
+		hsrq[ii] = srq1;
+		s_allocate_strmat(nu_+nx_+1, nu_+nx_, &hsL[ii]);
+		s_allocate_strvec(nx_, &hsPb[ii]);
+		s_allocate_strvec(nx_+nu_+1, &hsux[ii]);
+		s_allocate_strvec(nx_, &hspi[ii]);
+		}
+	hsRSQrq[N] = sQqN;
+	hsrq[N] = sqN;
+	s_allocate_strmat(nx_+1, nx_, &hsL[N]);
+	s_allocate_strvec(nx_+nu_+1, &hsux[N]);
+	s_allocate_strmat(nu_+nx_+1, nx_, &hswork_mat[0]);
+	s_allocate_strvec(nx_, &hswork_vec[0]);
+
+//	for(ii=0; ii<N; ii++)
+//		d_print_strmat(nu[ii]+nx[ii]+1, nx[ii+1], &hsBAbt[ii], 0, 0);
+//	return 0;
+
+/************************************************
+* call Riccati solver
+************************************************/	
+	
+	// timing 
+	struct timeval tv0, tv1, tv2, tv3;
+	int nrep = 1000;
+	int rep;
+
+	gettimeofday(&tv0, NULL); // time
+
+	for(rep=0; rep<nrep; rep++)
+		{
+		s_back_ric_sv_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hsux, hspi, hswork_mat, hswork_vec);
+		}
+
+	gettimeofday(&tv1, NULL); // time
+
+	for(rep=0; rep<nrep; rep++)
+		{
+		s_back_ric_trf_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hswork_mat);
+		}
+
+	gettimeofday(&tv2, NULL); // time
+
+	for(rep=0; rep<nrep; rep++)
+		{
+		s_back_ric_trs_libstr(N, nx, nu, hsBAbt, hsb, hsrq, hsL, hsPb, hsux, hspi, hswork_vec);
+		}
+
+	gettimeofday(&tv3, NULL); // time
+
+	float time_sv  = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+	float time_trf = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+	float time_trs = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+	// print sol
+	printf("\nux = \n\n");
+	for(ii=0; ii<=N; ii++)
+		s_print_tran_strvec(nu[ii]+nx[ii], &hsux[ii], 0);
+
+	printf("\npi = \n\n");
+	for(ii=0; ii<N; ii++)
+		s_print_tran_strvec(nx[ii+1], &hspi[ii], 0);
+
+//	printf("\nL = \n\n");
+//	for(ii=0; ii<=N; ii++)
+//		s_print_strmat(nu[ii]+nx[ii]+1, nu[ii]+nx[ii], &hsL[ii], 0, 0);
+
+	printf("\ntime sv\t\ttime trf\t\ttime trs\n");
+	printf("\n%e\t%e\t%e\n", time_sv, time_trf, time_trs);
+	printf("\n");
+
+/************************************************
+* free memory
+************************************************/	
+
+	d_free(Ad);
+	d_free(Bd);
+	d_free(bd);
+	d_free(x0d);
+	s_free(A);
+	s_free(B);
+	s_free(b);
+	s_free(x0);
+	s_free(R);
+	s_free(S);
+	s_free(Q);
+	s_free(r);
+	s_free(q);
+	s_free(b0);
+	s_free_strmat(&sA);
+	s_free_strvec(&sb);
+	s_free_strmat(&sBbt0);
+	s_free_strvec(&sb0);
+	s_free_strmat(&sBAbt1);
+	s_free_strmat(&sRr0);
+	s_free_strvec(&sr0);
+	s_free_strmat(&sRSQrq1);
+	s_free_strvec(&srq1);
+	s_free_strmat(&sQqN);
+	s_free_strvec(&sqN);
+	s_free_strmat(&hsL[0]);
+	s_free_strvec(&hsPb[0]);
+	s_free_strvec(&hsux[0]);
+	s_free_strvec(&hspi[0]);
+	for(ii=1; ii<N; ii++)
+		{
+		s_free_strmat(&hsL[ii]);
+		s_free_strvec(&hsPb[ii]);
+		s_free_strvec(&hsux[ii]);
+		s_free_strvec(&hspi[ii]);
+		}
+	s_free_strmat(&hsL[N]);
+	s_free_strvec(&hsux[N]);
+	s_free_strmat(&hswork_mat[0]);
+	s_free_strvec(&hswork_vec[0]);
+
+
+/************************************************
+* return
+************************************************/	
+
+	return 0;
+
+	}
+
+
+
+
diff --git a/examples/example_tree_riccati_recursion.c b/examples/example_tree_riccati_recursion.c
new file mode 100644
index 0000000..b61d2d3
--- /dev/null
+++ b/examples/example_tree_riccati_recursion.c
@@ -0,0 +1,638 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "tools.h"
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+
+
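+/*
+ * Variant of the backward Riccati sweep that keeps an explicitly transposed copy hsLxt of the
+ * state block of each Cholesky factor (via dtrtr_l), so that the products with the cost-to-go
+ * factors can use the upper-triangular dtrmm_rutn and dtrmv_unn/dtrmv_utn kernels.
+ */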
+void d_back_ric_sv_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hsLxt, struct d_strvec *hsux, struct d_strvec *hspi, struct d_strmat *hswork_mat, struct d_strvec *hswork_vec)
+	{
+
+	int nn;
+
+	// factorization and backward substitution
+
+	// last stage
+	dpotrf_l_libstr(nx[N]+1, nx[N], &hsRSQrq[N], 0, 0, &hsL[N], 0, 0);
+	dtrtr_l_libstr(nx[N], &hsL[N], 0, 0, &hsLxt[N], 0, 0);
+
+	// middle stages
+	for(nn=0; nn<N; nn++)
+		{
+		dtrmm_rutn_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hsLxt[N-nn], 0, 0, 0.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0);
+		dgead_libstr(1, nx[N-nn], 1.0, &hsL[N-nn], nu[N-nn]+nx[N-nn], nu[N-nn], &hswork_mat[0], nu[N-nn-1]+nx[N-nn-1], 0);
+#if 1
+		dsyrk_dpotrf_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#else
+		dsyrk_ln_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+		dpotrf_l_libstr(nu[N-nn-1]+nx[N-nn-1]+1, nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsL[N-nn-1], 0, 0);
+#endif
+		dtrtr_l_libstr(nx[N-nn-1], &hsL[N-nn-1], nu[N-nn-1], nu[N-nn-1], &hsLxt[N-nn-1], 0, 0);
+		}
+	
+	// forward substitution
+
+	// first stage
+	nn = 0;
+	drowex_libstr(nu[nn]+nx[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+	dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+	drowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+	dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+	dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+	drowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+	dtrmv_unn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hspi[nn], 0, &hspi[nn], 0);
+	daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+	dtrmv_utn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hspi[nn], 0, &hspi[nn], 0);
+
+	// middle stages
+	for(nn=1; nn<N; nn++)
+		{
+		drowex_libstr(nu[nn], -1.0, &hsL[nn], nu[nn]+nx[nn], 0, &hsux[nn], 0);
+		dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+		drowex_libstr(nx[nn+1], 1.0, &hsBAbt[nn], nu[nn]+nx[nn], 0, &hsux[nn+1], nu[nn+1]);
+		dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsux[nn+1], nu[nn+1], &hsux[nn+1], nu[nn+1]);
+		dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+		drowex_libstr(nx[nn+1], 1.0, &hsL[nn+1], nu[nn+1]+nx[nn+1], nu[nn+1], &hswork_vec[0], 0);
+		dtrmv_unn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hspi[nn], 0, &hspi[nn], 0);
+		daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+		dtrmv_utn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hspi[nn], 0, &hspi[nn], 0);
+		}
+
+	return;
+
+	}
+
+
+
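+/*
+ * "Funnel" factorization step for the tree-structured problem (an interpretation of the loop
+ * below): a node with md children accumulates, for each child i, the rank-nx update W_i * W_i'
+ * (with W_i = BAbt_i times the Cholesky factor of the child's cost-to-go) into a single matrix
+ * on top of RSQrq, and only then performs the node's Cholesky factorization.
+ */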
+void d_back_ric_trf_funnel1_libstr(int md, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hsLxt_old, struct d_strmat *hsLxt_new, struct d_strmat *hswork_mat)
+	{
+
+	int ii;
+
+	ii = 0;
+	dtrmm_rutn_libstr(nu[0]+nx[0], nx[1], 1.0, &hsBAbt[ii], 0, 0, &hsLxt_old[ii], 0, 0, 0.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0);
+	dsyrk_ln_libstr(nu[0]+nx[0], nu[0]+nx[0], nx[1], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[0], 0, 0, &hsL[0], 0, 0);
+	for(ii=1; ii<md; ii++)
+		{
+		dtrmm_rutn_libstr(nu[0]+nx[0], nx[1], 1.0, &hsBAbt[ii], 0, 0, &hsLxt_old[ii], 0, 0, 0.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0);
+		dsyrk_ln_libstr(nu[0]+nx[0], nu[0]+nx[0], nx[1], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsL[0], 0, 0, &hsL[0], 0, 0);
+		}
+
+	dpotrf_l_libstr(nu[0]+nx[0], nu[0]+nx[0], &hsL[0], 0, 0, &hsL[0], 0, 0);
+	dtrtr_l_libstr(nx[0], &hsL[0], nu[0], nu[0], &hsLxt_new[0], 0, 0);
+
+	return;
+
+	}
+
+
+
+void d_back_ric_trf_step1_libstr(int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hsLxt, struct d_strmat *hswork_mat)
+	{
+
+	dtrmm_rutn_libstr(nu[0]+nx[0], nx[1], 1.0, &hsBAbt[0], 0, 0, &hsLxt[1], 0, 0, 0.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0);
+	dsyrk_ln_libstr(nu[0]+nx[0], nu[0]+nx[0], nx[1], 1.0, &hswork_mat[0], 0, 0, &hswork_mat[0], 0, 0, 1.0, &hsRSQrq[0], 0, 0, &hsL[0], 0, 0);
+	dpotrf_l_libstr(nu[0]+nx[0], nu[0]+nx[0], &hsL[0], 0, 0, &hsL[0], 0, 0);
+	dtrtr_l_libstr(nx[0], &hsL[0], nu[0], nu[0], &hsLxt[0], 0, 0);
+
+	return;
+
+	}
+
+
+
+void d_back_ric_trf_stepN_libstr(int *nx, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hsLxt)
+	{
+
+	dpotrf_l_libstr(nx[0], nx[0], &hsRSQrq[0], 0, 0, &hsL[0], 0, 0);
+	dtrtr_l_libstr(nx[0], &hsL[0], 0, 0, &hsLxt[0], 0, 0);
+
+	return;
+
+	}
+
+
+
+void d_back_ric_trf_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strmat *hsRSQrq, struct d_strmat *hsL, struct d_strmat *hsLxt, struct d_strmat *hswork_mat)
+	{
+
+	int nn;
+
+	// factorization
+
+	// last stage
+	d_back_ric_trf_stepN_libstr(&nx[N], &hsRSQrq[N], &hsL[N], &hsLxt[N]);
+
+	// middle stages
+	for(nn=0; nn<N; nn++)
+		{
+		d_back_ric_trf_step1_libstr(&nx[N-nn-1], &nu[N-nn-1], &hsBAbt[N-nn-1], &hsRSQrq[N-nn-1], &hsL[N-nn-1], &hsLxt[N-nn-1], hswork_mat);
+		}
+	
+	return;
+
+	}
+
+
+
+void d_back_ric_trs_libstr(int N, int *nx, int *nu, struct d_strmat *hsBAbt, struct d_strvec *hsb, struct d_strvec *hsrq, struct d_strmat *hsL, struct d_strmat *hsLxt, struct d_strvec *hsPb, struct d_strvec *hsux, struct d_strvec *hspi, struct d_strvec *hswork_vec)
+	{
+
+	int nn;
+
+	// backward substitution
+
+	// last stage
+	dveccp_libstr(nu[N]+nx[N], 1.0, &hsrq[N], 0, &hsux[N], 0);
+
+	// middle stages
+	for(nn=0; nn<N-1; nn++)
+		{
+		// compute Pb
+		dtrmv_unn_libstr(nx[N-nn], &hsLxt[N-nn], 0, 0, &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+		dtrmv_utn_libstr(nx[N-nn], &hsLxt[N-nn], 0, 0, &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+		dveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+		dveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+		daxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+		dgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+		dtrsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+		}
+
+	// first stage
+	nn = N-1;
+	dtrmv_unn_libstr(nx[N-nn], &hsLxt[N-nn], 0, 0, &hsb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+	dtrmv_utn_libstr(nx[N-nn], &hsLxt[N-nn], 0, 0, &hsPb[N-nn-1], 0, &hsPb[N-nn-1], 0);
+	dveccp_libstr(nu[N-nn-1]+nx[N-nn-1], 1.0, &hsrq[N-nn-1], 0, &hsux[N-nn-1], 0);
+	dveccp_libstr(nx[N-nn], 1.0, &hsPb[N-nn-1], 0, &hswork_vec[0], 0);
+	daxpy_libstr(nx[N-nn], 1.0, &hsux[N-nn], nu[N-nn], &hswork_vec[0], 0);
+	dgemv_n_libstr(nu[N-nn-1]+nx[N-nn-1], nx[N-nn], 1.0, &hsBAbt[N-nn-1], 0, 0, &hswork_vec[0], 0, 1.0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+	dtrsv_lnn_libstr(nu[N-nn-1]+nx[N-nn-1], nu[N-nn-1]+nx[N-nn-1], &hsL[N-nn-1], 0, 0, &hsux[N-nn-1], 0, &hsux[N-nn-1], 0);
+
+	// forward substitution
+
+	// first stage
+	nn = 0;
+	dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+	dveccp_libstr(nu[nn]+nx[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+	dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn]+nx[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+	dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+	dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+	dtrmv_unn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hswork_vec[0], 0, &hswork_vec[0], 0);
+	dtrmv_utn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hswork_vec[0], 0, &hswork_vec[0], 0);
+	daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+
+	// middle stages
+	for(nn=1; nn<N; nn++)
+		{
+		dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hspi[nn], 0);
+		dveccp_libstr(nu[nn], -1.0, &hsux[nn], 0, &hsux[nn], 0);
+		dtrsv_ltn_libstr(nu[nn]+nx[nn], nu[nn], &hsL[nn], 0, 0, &hsux[nn], 0, &hsux[nn], 0);
+		dgemv_t_libstr(nu[nn]+nx[nn], nx[nn+1], 1.0, &hsBAbt[nn], 0, 0, &hsux[nn], 0, 1.0, &hsb[nn], 0, &hsux[nn+1], nu[nn+1]);
+		dveccp_libstr(nx[nn+1], 1.0, &hsux[nn+1], nu[nn+1], &hswork_vec[0], 0);
+		dtrmv_unn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hswork_vec[0], 0, &hswork_vec[0], 0);
+		dtrmv_utn_libstr(nx[nn+1], &hsLxt[nn+1], 0, 0, &hswork_vec[0], 0, &hswork_vec[0], 0);
+		daxpy_libstr(nx[nn+1], 1.0, &hswork_vec[0], 0, &hspi[nn], 0);
+		}
+
+	return;
+
+	}
+
+
+
+/************************************************ 
+Mass-spring system: nx/2 masses connected to each other by springs (in a row), with the first and the last one also connected to walls. nu (<=nx) controls act on the first nu masses. The system is sampled with sampling time Ts.
+************************************************/
+void mass_spring_system(double Ts, int nx, int nu, int N, double *A, double *B, double *b, double *x0)
+	{
+
+	int nx2 = nx*nx;
+
+	int info = 0;
+
+	int pp = nx/2; // number of masses
+	
+/************************************************
+* build the continuous time system 
+************************************************/
+	
+	double *T; d_zeros(&T, pp, pp);
+	int ii;
+	for(ii=0; ii<pp; ii++) T[ii*(pp+1)] = -2;
+	for(ii=0; ii<pp-1; ii++) T[ii*(pp+1)+1] = 1;
+	for(ii=1; ii<pp; ii++) T[ii*(pp+1)-1] = 1;
+
+	double *Z; d_zeros(&Z, pp, pp);
+	double *I; d_zeros(&I, pp, pp); for(ii=0; ii<pp; ii++) I[ii*(pp+1)]=1.0; // = eye(pp);
+	double *Ac; d_zeros(&Ac, nx, nx);
+	dmcopy(pp, pp, Z, pp, Ac, nx);
+	dmcopy(pp, pp, T, pp, Ac+pp, nx);
+	dmcopy(pp, pp, I, pp, Ac+pp*nx, nx);
+	dmcopy(pp, pp, Z, pp, Ac+pp*(nx+1), nx); 
+	free(T);
+	free(Z);
+	free(I);
+	
+	d_zeros(&I, nu, nu); for(ii=0; ii<nu; ii++) I[ii*(nu+1)]=1.0; //I = eye(nu);
+	double *Bc; d_zeros(&Bc, nx, nu);
+	dmcopy(nu, nu, I, nu, Bc+pp, nx);
+	free(I);
+	
+/************************************************
+* compute the discrete time system 
+************************************************/
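+	// the code below implements the exact (zero-order-hold) discretization:
+	//   A = expm(Ac*Ts),   B = Ac^{-1} * (A - I) * Bc
+	// expm() applies a scaled-and-squared Pade approximant (see examples/tools.c),
+	// and the final dgesv_3l solves Ac*B = (A - I)*Bc for B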
+
+	double *bb; d_zeros(&bb, nx, 1);
+	dmcopy(nx, 1, bb, nx, b, nx);
+		
+	dmcopy(nx, nx, Ac, nx, A, nx);
+	dscal_3l(nx2, Ts, A);
+	expm(nx, A);
+	
+	d_zeros(&T, nx, nx);
+	d_zeros(&I, nx, nx); for(ii=0; ii<nx; ii++) I[ii*(nx+1)]=1.0; //I = eye(nx);
+	dmcopy(nx, nx, A, nx, T, nx);
+	daxpy_3l(nx2, -1.0, I, T);
+	dgemm_nn_3l(nx, nu, nx, T, nx, Bc, nx, B, nx);
+	free(T);
+	free(I);
+	
+	int *ipiv = (int *) malloc(nx*sizeof(int));
+	dgesv_3l(nx, nu, Ac, nx, ipiv, B, nx, &info);
+	free(ipiv);
+
+	free(Ac);
+	free(Bc);
+	free(bb);
+	
+			
+/************************************************
+* initial state 
+************************************************/
+	
+	if(nx==4)
+		{
+		x0[0] = 5;
+		x0[1] = 10;
+		x0[2] = 15;
+		x0[3] = 20;
+		}
+	else
+		{
+		int jj;
+		for(jj=0; jj<nx; jj++)
+			x0[jj] = 1;
+		}
+
+	}
+
+
+
+int main()
+	{
+
+	printf("\nExample of Riccati recursion (factorization and backsolve)\n\n");
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+	printf("\nLA provided by BLASFEO\n\n");
+
+#elif defined(LA_BLAS)
+
+	printf("\nLA provided by BLAS\n\n");
+
+#else
+
+	printf("\nLA provided by ???\n\n");
+	exit(2);
+
+#endif
+
+	// loop index
+	int ii;
+
+/************************************************
+* problem size
+************************************************/	
+
+	// problem size
+	int N = 4;
+	int nx_ = 8;
+	int nu_ = 3;
+
+	// stage-wise variant size
+	int nx[N+1];
+	nx[0] = 0;
+	for(ii=1; ii<=N; ii++)
+		nx[ii] = nx_;
+	nx[N] = nx_;
+
+	int nu[N+1];
+	for(ii=0; ii<N; ii++)
+		nu[ii] = nu_;
+	nu[N] = 0;
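+	// nx[0] = 0: the known initial state x0 is eliminated from the first stage
+	// (its effect is folded into b0 = A*x0 + b below); nu[N] = 0: no input acts
+	// on the terminal stage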
+
+/************************************************
+* dynamical system
+************************************************/	
+
+	double *A; d_zeros(&A, nx_, nx_); // states update matrix
+
+	double *B; d_zeros(&B, nx_, nu_); // inputs matrix
+
+	double *b; d_zeros(&b, nx_, 1); // states offset
+	double *x0; d_zeros_align(&x0, nx_, 1); // initial state
+
+	double Ts = 0.5; // sampling time
+	mass_spring_system(Ts, nx_, nu_, N, A, B, b, x0);
+	
+	for(ii=0; ii<nx_; ii++)
+		b[ii] = 0.1;
+	
+	for(ii=0; ii<nx_; ii++)
+		x0[ii] = 0;
+	x0[0] = 2.5;
+	x0[1] = 2.5;
+
+	d_print_mat(nx_, nx_, A, nx_);
+	d_print_mat(nx_, nu_, B, nx_);
+	d_print_mat(1, nx_, b, 1);
+	d_print_mat(1, nx_, x0, 1);
+
+/************************************************
+* cost function
+************************************************/	
+
+	double *R; d_zeros(&R, nu_, nu_);
+	for(ii=0; ii<nu_; ii++) R[ii*(nu_+1)] = 2.0;
+
+	double *S; d_zeros(&S, nu_, nx_);
+
+	double *Q; d_zeros(&Q, nx_, nx_);
+	for(ii=0; ii<nx_; ii++) Q[ii*(nx_+1)] = 1.0;
+
+	double *r; d_zeros(&r, nu_, 1);
+	for(ii=0; ii<nu_; ii++) r[ii] = 0.2;
+
+	double *q; d_zeros(&q, nx_, 1);
+	for(ii=0; ii<nx_; ii++) q[ii] = 0.1;
+
+	d_print_mat(nu_, nu_, R, nu_);
+	d_print_mat(nu_, nx_, S, nu_);
+	d_print_mat(nx_, nx_, Q, nx_);
+	d_print_mat(1, nu_, r, 1);
+	d_print_mat(1, nx_, q, 1);
+
+/************************************************
+* matrices as strmat
+************************************************/	
+
+	struct d_strmat sA;
+	d_allocate_strmat(nx_, nx_, &sA);
+	d_cvt_mat2strmat(nx_, nx_, A, nx_, &sA, 0, 0);
+	struct d_strvec sb;
+	d_allocate_strvec(nx_, &sb);
+	d_cvt_vec2strvec(nx_, b, &sb, 0);
+	struct d_strvec sx0;
+	d_allocate_strvec(nx_, &sx0);
+	d_cvt_vec2strvec(nx_, x0, &sx0, 0);
+	struct d_strvec sb0;
+	d_allocate_strvec(nx_, &sb0);
+	double *b0; d_zeros(&b0, nx_, 1); // states offset
+	dgemv_n_libstr(nx_, nx_, 1.0, &sA, 0, 0, &sx0, 0, 1.0, &sb, 0, &sb0, 0);
+	d_print_tran_strvec(nx_, &sb0, 0);
+
+	struct d_strmat sBbt0;
+	d_allocate_strmat(nu_+nx_+1, nx_, &sBbt0);
+	d_cvt_tran_mat2strmat(nx_, nu_, B, nx_, &sBbt0, 0, 0);
+	drowin_libstr(nx_, 1.0, &sb0, 0, &sBbt0, nu_, 0);
+	d_print_strmat(nu_+1, nx_, &sBbt0, 0, 0);
+
+	struct d_strmat sBAbt1;
+	d_allocate_strmat(nu_+nx_+1, nx_, &sBAbt1);
+	d_cvt_tran_mat2strmat(nx_, nu_, B, nx_, &sBAbt1, 0, 0);
+	d_cvt_tran_mat2strmat(nx_, nx_, A, nx_, &sBAbt1, nu_, 0);
+	d_cvt_tran_mat2strmat(nx_, 1, b, nx_, &sBAbt1, nu_+nx_, 0);
+	d_print_strmat(nu_+nx_+1, nx_, &sBAbt1, 0, 0);
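+	// stage dynamics are stored transposed, with the affine term as an extra row:
+	// sBAbt1 = [B'; A'; b'] of size (nu_+nx_+1) x nx_, while the first stage uses
+	// sBbt0 = [B'; b0'] since x0 has been eliminated into b0 = A*x0 + b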
+
+	struct d_strvec sr0; // XXX no need to update r0 since S=0
+	d_allocate_strvec(nu_, &sr0);
+	d_cvt_vec2strvec(nu_, r, &sr0, 0);
+
+	struct d_strmat sRr0;
+	d_allocate_strmat(nu_+1, nu_, &sRr0);
+	d_cvt_mat2strmat(nu_, nu_, R, nu_, &sRr0, 0, 0);
+	drowin_libstr(nu_, 1.0, &sr0, 0, &sRr0, nu_, 0);
+	d_print_strmat(nu_+1, nu_, &sRr0, 0, 0);
+
+	struct d_strvec srq1;
+	d_allocate_strvec(nu_+nx_, &srq1);
+	d_cvt_vec2strvec(nu_, r, &srq1, 0);
+	d_cvt_vec2strvec(nx_, q, &srq1, nu_);
+
+	struct d_strmat sRSQrq1;
+	d_allocate_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1);
+	d_cvt_mat2strmat(nu_, nu_, R, nu_, &sRSQrq1, 0, 0);
+	d_cvt_tran_mat2strmat(nu_, nx_, S, nu_, &sRSQrq1, nu_, 0);
+	d_cvt_mat2strmat(nx_, nx_, Q, nx_, &sRSQrq1, nu_, nu_);
+	drowin_libstr(nu_+nx_, 1.0, &srq1, 0, &sRSQrq1, nu_+nx_, 0);
+	d_print_strmat(nu_+nx_+1, nu_+nx_, &sRSQrq1, 0, 0);
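+	// cost data follow the same transposed-and-stacked convention:
+	// sRSQrq1 = [R, .; S', Q; r', q'] of size (nu_+nx_+1) x (nu_+nx_); the block
+	// above Q is never written, presumably because the factorization only reads
+	// the lower-triangular part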
+
+	struct d_strvec sqN;
+	d_allocate_strvec(nx_, &sqN);
+	d_cvt_vec2strvec(nx_, q, &sqN, 0);
+
+	struct d_strmat sQqN;
+	d_allocate_strmat(nx_+1, nx_, &sQqN);
+	d_cvt_mat2strmat(nx_, nx_, Q, nx_, &sQqN, 0, 0);
+	drowin_libstr(nx_, 1.0, &sqN, 0, &sQqN, nx_, 0);
+	d_print_strmat(nx_+1, nx_, &sQqN, 0, 0);
+
+/************************************************
+* array of matrices
+************************************************/	
+	
+	struct d_strmat hsBAbt[N];
+	struct d_strvec hsb[N];
+	struct d_strmat hsRSQrq[N+1];
+	struct d_strvec hsrq[N+1];
+	struct d_strmat hsL[N+1];
+	struct d_strmat hsLxt[N+1];
+	struct d_strvec hsPb[N];
+	struct d_strvec hsux[N+1];
+	struct d_strvec hspi[N];
+	struct d_strmat hswork_mat[1];
+	struct d_strvec hswork_vec[1];
+
+	hsBAbt[0] = sBbt0;
+	hsb[0] = sb0;
+	hsRSQrq[0] = sRr0;
+	hsrq[0] = sr0;
+	d_allocate_strmat(nu_+1, nu_, &hsL[0]);
+//	d_allocate_strmat(nu_+1, nu_, &hsLxt[0]);
+	d_allocate_strvec(nx_, &hsPb[0]);
+	d_allocate_strvec(nx_+nu_+1, &hsux[0]);
+	d_allocate_strvec(nx_, &hspi[0]);
+	for(ii=1; ii<N; ii++)
+		{
+		hsBAbt[ii] = sBAbt1;
+		hsb[ii] = sb;
+		hsRSQrq[ii] = sRSQrq1;
+		hsrq[ii] = srq1;
+		d_allocate_strmat(nu_+nx_+1, nu_+nx_, &hsL[ii]);
+		d_allocate_strmat(nx_, nu_+nx_, &hsLxt[ii]);
+		d_allocate_strvec(nx_, &hsPb[ii]);
+		d_allocate_strvec(nx_+nu_+1, &hsux[ii]);
+		d_allocate_strvec(nx_, &hspi[ii]);
+		}
+	hsRSQrq[N] = sQqN;
+	hsrq[N] = sqN;
+	d_allocate_strmat(nx_+1, nx_, &hsL[N]);
+	d_allocate_strmat(nx_, nx_, &hsLxt[N]);
+	d_allocate_strvec(nx_+nu_+1, &hsux[N]);
+	d_allocate_strmat(nu_+nx_+1, nx_, &hswork_mat[0]);
+	d_allocate_strvec(nx_, &hswork_vec[0]);
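+	// note: stages 1..N-1 share data, since hsBAbt[ii] = sBAbt1 etc. copy the
+	// strmat/strvec structs by value (same underlying memory); only the
+	// factorization and solution quantities (hsL, hsLxt, hsPb, hsux, hspi)
+	// get per-stage storage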
+
+//	for(ii=0; ii<N; ii++)
+//		d_print_strmat(nu[ii]+nx[ii]+1, nx[ii+1], &hsBAbt[ii], 0, 0);
+//	return 0;
+
+/************************************************
+* call Riccati solver
+************************************************/	
+	
+	// timing 
+	struct timeval tv0, tv1, tv2, tv3;
+	int nrep = 1000;
+	int rep;
+
+	gettimeofday(&tv0, NULL); // time
+
+	for(rep=0; rep<nrep; rep++)
+		{
+//		d_back_ric_sv_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hsLxt, hsux, hspi, hswork_mat, hswork_vec);
+		}
+
+	gettimeofday(&tv1, NULL); // time
+
+	for(rep=0; rep<nrep; rep++)
+		{
+		d_back_ric_trf_libstr(N, nx, nu, hsBAbt, hsRSQrq, hsL, hsLxt, hswork_mat);
+		}
+
+	gettimeofday(&tv2, NULL); // time
+
+	for(rep=0; rep<nrep; rep++)
+		{
+		d_back_ric_trs_libstr(N, nx, nu, hsBAbt, hsb, hsrq, hsL, hsLxt, hsPb, hsux, hspi, hswork_vec);
+		}
+
+	gettimeofday(&tv3, NULL); // time
+
+	float time_sv  = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+	float time_trf = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+	float time_trs = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
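+	// average wall-clock time per call, in seconds; time_sv only measures the
+	// empty loop above since the combined d_back_ric_sv_libstr call is commented out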
+
+	// print sol
+	printf("\nux = \n\n");
+	for(ii=0; ii<=N; ii++)
+		d_print_tran_strvec(nu[ii]+nx[ii], &hsux[ii], 0);
+
+	printf("\npi = \n\n");
+	for(ii=0; ii<N; ii++)
+		d_print_tran_strvec(nx[ii+1], &hspi[ii], 0);
+
+	printf("\ntime sv\t\ttime trf\t\ttime trs\n");
+	printf("\n%e\t%e\t%e\n", time_sv, time_trf, time_trs);
+	printf("\n");
+
+/************************************************
+* free memory
+************************************************/	
+
+	d_free(A);
+	d_free(B);
+	d_free(b);
+	d_free_align(x0);
+	d_free(R);
+	d_free(S);
+	d_free(Q);
+	d_free(r);
+	d_free(q);
+	d_free(b0);
+	d_free_strmat(&sA);
+	d_free_strvec(&sb);
+	d_free_strmat(&sBbt0);
+	d_free_strvec(&sb0);
+	d_free_strmat(&sBAbt1);
+	d_free_strmat(&sRr0);
+	d_free_strvec(&sr0);
+	d_free_strmat(&sRSQrq1);
+	d_free_strvec(&srq1);
+	d_free_strmat(&sQqN);
+	d_free_strvec(&sqN);
+	d_free_strmat(&hsL[0]);
+//	d_free_strmat(&hsLxt[0]);
+	d_free_strvec(&hsPb[0]);
+	d_free_strvec(&hsux[0]);
+	d_free_strvec(&hspi[0]);
+	for(ii=1; ii<N; ii++)
+		{
+		d_free_strmat(&hsL[ii]);
+		d_free_strmat(&hsLxt[ii]);
+		d_free_strvec(&hsPb[ii]);
+		d_free_strvec(&hsux[ii]);
+		d_free_strvec(&hspi[ii]);
+		}
+	d_free_strmat(&hsL[N]);
+	d_free_strmat(&hsLxt[N]);
+	d_free_strvec(&hsux[N]);
+	d_free_strmat(&hswork_mat[0]);
+	d_free_strvec(&hswork_vec[0]);
+
+
+/************************************************
+* return
+************************************************/	
+
+	return 0;
+
+	}
+
+
+
diff --git a/examples/tools.c b/examples/tools.c
new file mode 100644
index 0000000..51d9e95
--- /dev/null
+++ b/examples/tools.c
@@ -0,0 +1,724 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of HPMPC.                                                                     *
+*                                                                                                 *
+* HPMPC -- Library for High-Performance implementation of solvers for MPC.                        *
+* Copyright (C) 2014-2015 by Technical University of Denmark. All rights reserved.                *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+//#include "../include/aux_d.h"
+
+//void dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *A, int *lda, double *B, int *ldb, double *beta, double *C, int *ldc);
+//void dgesv_(int *n, int *nrhs, double *A, int *lda, int *ipiv, double *B, int *ldb, int *info);
+//void dcopy_(int *n, double *dx, int *incx, double *dy, int *incy);
+//void daxpy_(int *n, double *da, double *dx, int *incx, double *dy, int *incy);
+//void dscal_(int *n, double *da, double *dx, int *incx);
+
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+
+
+
+/************************************************
+ matrix-matrix multiplication
+************************************************/
+void dgemm_nn_3l(int m, int n, int k, double *A, int lda , double *B, int ldb, double *C, int ldc)
+	{
+	
+	int ii, jj, kk;
+	
+	for(jj=0; jj<n; jj++)
+		{
+		for(ii=0; ii<m; ii++)
+			{
+			C[ii+ldc*jj] = 0;
+			for(kk=0; kk<k; kk++)
+				{
+				C[ii+ldc*jj] += A[ii+lda*kk] * B[kk+ldb*jj];
+				}
+			}
+		}
+	
+	return;
+	
+	}
+
+
+void daxpy_3l(int n, double da, double *dx, double *dy)
+	{
+	int i;
+	for(i=0; i<n; i++)
+		{
+		dy[i] += da*dx[i];
+		}
+	}
+
+
+
+void dscal_3l(int n, double da, double *dx)
+	{
+	int i;
+	for(i=0; i<n; i++)
+		{
+		dx[i] *= da;
+		}
+	}
+
+
+
+/************************************************
+ Routine that copies a matrix 
+************************************************/
+void dmcopy(int row, int col, double *A, int lda, double *B, int ldb)
+	{
+	int i, j;
+	for(j=0; j<col; j++)
+		{
+		for(i=0; i<row; i++)
+			{
+			B[i+j*ldb] = A[i+j*lda];
+			}
+		}
+	}
+
+
+
+int idamax_3l(int n, double *x)
+	{
+	
+	if(n<=0)
+		return 0;
+	if(n==1)
+		return 0;	
+
+	double dabs;
+	double dmax = (x[0]>0 ? x[0] : -x[0]);
+	int idmax = 0;
+	int jj;
+	for(jj=1; jj<n; jj++)
+		{
+		dabs = (x[jj]>0 ? x[jj] : -x[jj]);
+		if(dabs>dmax)
+			{
+			dmax = dabs;
+			idmax = jj;
+			}
+		}
+	
+	return idmax;
+
+	}
+
+
+
+void dswap_3l(int n, double *x, int incx, double *y, int incy)
+	{
+	
+	if(n<=0)
+		return;
+	
+	double temp;
+	int jj;
+	for(jj=0; jj<n; jj++)
+		{
+		temp = x[0];
+		x[0] = y[0];
+		y[0] = temp;
+		x += incx;
+		y += incy;
+		}
+	
+	}
+
+
+
+void dger_3l(int m, int n, double alpha, double *x, int incx, double *y, int incy, double *A, int lda)
+	{
+	
+	if(m==0 || n==0 || alpha==0.0)
+		return;
+	
+	int i, j;
+	double *px, *py, temp;
+	
+	py = y;
+	for(j=0; j<n; j++)
+		{
+		temp = alpha * py[0];
+		px = x;
+		for(i=0; i<m; i++)
+			{
+			A[i+lda*j] += px[0] * temp;
+			px += incx;
+			}
+		py += incy;
+		}
+	
+	return;
+	
+	}
+
+
+
+void dgetf2_3l(int m, int n, double *A, int lda, int *ipiv, int *info)
+	{
+	
+	if(m<=0 || n<=0)
+		return;
+	
+	int i, j, jp;
+	
+	double Ajj;
+	
+	int size_min = ( m<n ? m : n );
+	
+	for(j=0; j<size_min; j++)
+		// find the pivot and test for singularity
+		{
+		jp = j + idamax_3l(m-j, &A[j+lda*j]);
+		ipiv[j] = jp;
+		if( A[jp+lda*j]!=0)
+			{
+			// apply the interchange to columns 0:n-1
+			if(jp!=j)
+				{
+				dswap_3l(n, &A[j], lda, &A[jp], lda);
+				}
+			// compute elements j+1:m-1 of j-th column
+			if(j<m-1)
+				{
+				Ajj = A[j+lda*j];
+				if( ( Ajj>0 ? Ajj : -Ajj ) >= 2.22e-16 )
+					{
+					dscal_3l(m-j-1, 1.0/Ajj, &A[j+1+lda*j]);
+					}
+				else
+					{
+					for(i=j+1; i<m; i++)
+						{
+						A[i+lda*j] /= Ajj;
+						}
+					}
+				}
+			}
+		else if(*info==0)
+			{
+			*info = j+1;
+			}
+		
+		if( j < size_min )
+			{
+			// update trailing submatrix
+			dger_3l(m-j-1, n-j-1, -1.0, &A[j+1+lda*j], 1, &A[j+lda*(j+1)], lda, &A[j+1+lda*(j+1)], lda);
+			}
+		
+		}
+
+	return;	
+	
+	}
+
+
+
+void dlaswp_3l(int n, double *A, int lda, int k1, int k2, int *ipiv)
+	{
+	
+	int i, j, k, ix, ix0, i1, i2, n32, ip;
+	double temp;
+
+	ix0 = k1;
+	i1 = k1;
+	i2 = k2;
+	
+	n32 = (n/32)*32;
+	if(n32!=0)
+		{
+		for(j=0; j<n32; j+=32)
+			{
+			ix = ix0;
+			for(i=i1; i<i2; i++)
+				{
+				ip = ipiv[ix];
+				if(ip!=i)
+					{
+					for(k=j; k<j+32; k++)
+						{
+						temp = A[i+lda*k];
+						A[i+lda*k] = A[ip+lda*k];
+						A[ip+lda*k] = temp;
+						}
+					}
+				ix++;
+				}
+			}
+		}
+	if(n32!=n)
+		{
+		ix = ix0;
+		for(i=i1; i<i2; i++)
+			{
+			ip = ipiv[ix];
+			if(ip!=i)
+				{
+				for(k=n32; k<n; k++)
+					{
+					temp = A[i+lda*k];
+					A[i+lda*k] = A[ip+lda*k];
+					A[ip+lda*k] = temp;
+					}
+				}
+			ix++;
+			}
+		}
+
+	return;
+	
+	}
+
+
+
+// left lower no-transp unit
+void dtrsm_l_l_n_u_3l(int m, int n, double *A, int lda, double *B, int ldb)
+	{
+	
+	if(m==0 || n==0)
+		return;
+	
+	int i, j, k;
+	
+	for(j=0; j<n; j++)
+		{
+		for(k=0; k<m; k++)
+			{
+			for(i=k+1; i<m; i++)
+				{
+				B[i+ldb*j] -= B[k+ldb*j] * A[i+lda*k];
+				}
+			}
+		}
+	
+	return;
+	
+	}
+
+
+
+// left upper no-transp non-unit
+void dtrsm_l_u_n_n_3l(int m, int n, double *A, int lda, double *B, int ldb)
+	{
+	
+	if(m==0 || n==0)
+		return;
+	
+	int i, j, k;
+	
+	for(j=0; j<n; j++)
+		{
+		for(k=m-1; k>=0; k--)
+			{
+			B[k+ldb*j] /= A[k+lda*k];
+			for(i=0; i<k; i++)
+				{
+				B[i+ldb*j] -= B[k+ldb*j] * A[i+lda*k];
+				}
+			}
+		}
+
+	return;
+	
+	}
+
+
+
+void dgetrs_3l(int n, int nrhs, double *A, int lda, int *ipiv, double *B, int ldb, int *info)
+	{
+	
+	if(n==0 || nrhs==0)
+		return;
+	
+	// solve A * X = B
+
+	// apply row interchanges to the rhs
+	dlaswp_3l(nrhs, B, ldb, 0, n, ipiv);
+
+	// solve L*X = B, overwriting B with X
+	dtrsm_l_l_n_u_3l(n, nrhs, A, lda, B, ldb);
+
+	// solve U*X = B, overwriting B with X
+	dtrsm_l_u_n_n_3l(n, nrhs, A, lda, B, ldb);
+
+	return;
+	  	
+	}
+
+
+
+void dgesv_3l(int n, int nrhs, double *A, int lda, int *ipiv, double *B, int ldb, int *info)
+	{
+	
+	// compute the LU factorization of A
+	dgetf2_3l(n, n, A, lda, ipiv, info);
+	
+	if(*info==0)
+		{
+		// solve the system A*X = B, overwriting B with X
+		dgetrs_3l(n, nrhs, A, lda, ipiv, B, ldb, info);
+		}
+
+	return;
+	
+	}
+
+
+
+/* one norm of a matrix */
+double onenorm(int row, int col, double *ptrA)
+	{
+	double max, temp;
+	int i, j;
+	max = 0;
+	for(j=0; j<col; j++)
+		{
+		// use fabs(): abs() would truncate the doubles to int
+		temp = fabs(*(ptrA+j*row));
+		for(i=1; i<row; i++)
+			{
+			temp += fabs(*(ptrA+j*row+i));
+			}
+		// keep the largest column sum
+		if(temp>max) max = temp;
+		}
+	return max;
+	}
+
+
+
+/* computes the Pade approximation of degree m of the matrix A */
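+/*
+The diagonal Pade approximant is evaluated as r_m(A) = (V - U)^{-1} (V + U), where U
+collects the odd powers of A (A times a polynomial in A^2) and V the even powers;
+the final dgesv_3l call performs the (V - U) solve. This is the standard evaluation
+used in scaling-and-squaring implementations of the matrix exponential.
+*/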
+void padeapprox(int m, int row, double *A)
+	{
+	int ii;
+	int row2 = row*row;
+/*	int i1 = 1;*/
+/*	double d0 = 0;*/
+/*	double d1 = 1;*/
+/*	double dm1 = -1;*/
+	
+	double *U = (double *) malloc(row*row*sizeof(double)); // d_zeros(&U, row, row); 
+	double *V = (double *) malloc(row*row*sizeof(double)); // d_zeros(&V, row, row);
+	
+	if(m==3)
+		{
+		double c[] = {120, 60, 12, 1};
+		double *A0 = (double *) malloc(row*row*sizeof(double)); // d_eye(&A0, row);
+		for(ii=0; ii<row*row; ii++)
+			A0[ii] = 0.0;
+		for(ii=0; ii<row; ii++)
+			A0[ii*(row+1)] = 1.0;
+		double *A2 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+		double *temp = malloc(row*row*sizeof(double)); // d_zeros(&temp, row, row);
+//		char ta = 'n'; double alpha = 1; double beta = 0;
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
+		dgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
+//		dscal_(&row2, &d0, temp, &i1);
+		dscal_3l(row2, 0, temp);
+//		daxpy_(&row2, &c[3], A2, &i1, temp, &i1);
+		daxpy_3l(row2, c[3], A2, temp);
+//		daxpy_(&row2, &c[1], A0, &i1, temp, &i1);
+		daxpy_3l(row2, c[1], A0, temp);
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
+		dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
+//		dscal_(&row2, &d0, V, &i1);
+		dscal_3l(row2, 0, V);
+//		daxpy_(&row2, &c[2], A2, &i1, V, &i1);
+		daxpy_3l(row2, c[2], A2, V);
+//		daxpy_(&row2, &c[0], A0, &i1, V, &i1);
+		daxpy_3l(row2, c[0], A0, V);
+		free(A0);
+		free(A2);
+		free(temp);
+		}
+	else if(m==5)
+		{
+		double c[] = {30240, 15120, 3360, 420, 30, 1};
+		double *A0 = (double *) malloc(row*row*sizeof(double)); // d_eye(&A0, row);
+		for(ii=0; ii<row*row; ii++)
+			A0[ii] = 0.0;
+		for(ii=0; ii<row; ii++)
+			A0[ii*(row+1)] = 1.0;
+		double *A2 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+		double *A4 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+		double *temp = malloc(row*row*sizeof(double)); // d_zeros(&temp, row, row);
+//		char ta = 'n'; double alpha = 1; double beta = 0;
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
+		dgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A2, &row, A2, &row, &beta, A4, &row);
+		dgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
+		dmcopy(row, row, A4, row, V, row);
+		dmcopy(row, row, A4, row, temp, row);
+//		daxpy_(&row2, &c[3], A2, &i1, temp, &i1);
+		daxpy_3l(row2, c[3], A2, temp);
+//		daxpy_(&row2, &c[1], A0, &i1, temp, &i1);
+		daxpy_3l(row2, c[1], A0, temp);
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
+		dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
+//		dscal_(&row2, &c[4], V, &i1);
+		dscal_3l(row2, c[4], V);
+//		daxpy_(&row2, &c[2], A2, &i1, V, &i1);
+		daxpy_3l(row2, c[2], A2, V);
+//		daxpy_(&row2, &c[0], A0, &i1, V, &i1);
+		daxpy_3l(row2, c[0], A0, V);
+		free(A0);
+		free(A2);
+		free(A4);
+		free(temp);
+		}
+	else if(m==7)
+		{
+		double c[] = {17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1};
+		double *A0 = (double *) malloc(row*row*sizeof(double)); // d_eye(&A0, row);
+		for(ii=0; ii<row*row; ii++)
+			A0[ii] = 0.0;
+		for(ii=0; ii<row; ii++)
+			A0[ii*(row+1)] = 1.0;
+		double *A2 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+		double *A4 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+		double *A6 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+		double *temp = malloc(row*row*sizeof(double)); // d_zeros(&temp, row, row);
+//		char ta = 'n'; double alpha = 1; double beta = 1;
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
+		dgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A2, &row, A2, &row, &beta, A4, &row);
+		dgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A4, &row, A2, &row, &beta, A6, &row);
+		dgemm_nn_3l(row, row, row, A4, row, A2, row, A6, row);
+//		dscal_(&row2, &d0, temp, &i1);
+		dscal_3l(row2, 0, temp);
+//		daxpy_(&row2, &c[3], A2, &i1, temp, &i1);
+		daxpy_3l(row2, c[3], A2, temp);
+//		daxpy_(&row2, &c[1], A0, &i1, temp, &i1);
+		daxpy_3l(row2, c[1], A0, temp);
+//		daxpy_(&row2, &c[5], A4, &i1, temp, &i1);
+		daxpy_3l(row2, c[5], A4, temp);
+//		daxpy_(&row2, &c[7], A6, &i1, temp, &i1);
+		daxpy_3l(row2, c[7], A6, temp);
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
+		dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
+//		dscal_(&row2, &d0, V, &i1);
+		dscal_3l(row2, 0, V);
+//		daxpy_(&row2, &c[2], A2, &i1, V, &i1);
+		daxpy_3l(row2, c[2], A2, V);
+//		daxpy_(&row2, &c[0], A0, &i1, V, &i1);
+		daxpy_3l(row2, c[0], A0, V);
+//		daxpy_(&row2, &c[4], A4, &i1, V, &i1);
+		daxpy_3l(row2, c[4], A4, V);
+//		daxpy_(&row2, &c[6], A6, &i1, V, &i1);
+		daxpy_3l(row2, c[6], A6, V);
+		free(A0);
+		free(A2);
+		free(A4);
+		free(A6);
+		free(temp);
+		}
+	else if(m==9)
+		{
+		double c[] = {17643225600, 8821612800, 2075673600, 302702400, 30270240, 2162160, 110880, 3960, 90, 1};		
+		double *A0 = (double *) malloc(row*row*sizeof(double)); // d_eye(&A0, row);
+		for(ii=0; ii<row*row; ii++)
+			A0[ii] = 0.0;
+		for(ii=0; ii<row; ii++)
+			A0[ii*(row+1)] = 1.0;
+		double *A2 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+		double *A4 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+		double *A6 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+		double *A8 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+		double *temp = malloc(row*row*sizeof(double)); // d_zeros(&temp, row, row);
+//		char ta = 'n'; double alpha = 1; double beta = 0;
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
+		dgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A2, &row, A2, &row, &beta, A4, &row);
+		dgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A4, &row, A2, &row, &beta, A6, &row);
+		dgemm_nn_3l(row, row, row, A4, row, A2, row, A6, row);
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A6, &row, A2, &row, &beta, A8, &row);
+		dgemm_nn_3l(row, row, row, A6, row, A2, row, A8, row);
+		dmcopy(row, row, A8, row, V, row);
+		dmcopy(row, row, A8, row, temp, row);
+//		daxpy_(&row2, &c[3], A2, &i1, temp, &i1);
+		daxpy_3l(row2, c[3], A2, temp);
+//		daxpy_(&row2, &c[1], A0, &i1, temp, &i1);
+		daxpy_3l(row2, c[1], A0, temp);
+//		daxpy_(&row2, &c[5], A4, &i1, temp, &i1);
+		daxpy_3l(row2, c[5], A4, temp);
+//		daxpy_(&row2, &c[7], A6, &i1, temp, &i1);
+		daxpy_3l(row2, c[7], A6, temp);
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
+		dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
+//		dscal_(&row2, &c[8], V, &i1);
+		dscal_3l(row2, c[8], V);
+//		daxpy_(&row2, &c[2], A2, &i1, V, &i1);
+		daxpy_3l(row2, c[2], A2, V);
+//		daxpy_(&row2, &c[0], A0, &i1, V, &i1);
+		daxpy_3l(row2, c[0], A0, V);
+//		daxpy_(&row2, &c[4], A4, &i1, V, &i1);
+		daxpy_3l(row2, c[4], A4, V);
+//		daxpy_(&row2, &c[6], A6, &i1, V, &i1);
+		daxpy_3l(row2, c[6], A6, V);
+		free(A0);
+		free(A2);
+		free(A4);
+		free(A6);
+		free(A8);
+		free(temp);
+		}
+	else if(m==13) // tested
+		{
+		double c[] = {64764752532480000, 32382376266240000, 7771770303897600, 1187353796428800, 129060195264000, 10559470521600, 670442572800, 33522128640, 1323241920, 40840800, 960960, 16380, 182, 1};
+		double *A0 = (double *) malloc(row*row*sizeof(double)); // d_eye(&A0, row);
+		for(ii=0; ii<row*row; ii++)
+			A0[ii] = 0.0;
+		for(ii=0; ii<row; ii++)
+			A0[ii*(row+1)] = 1.0;
+		double *A2 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+		double *A4 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+		double *A6 = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+		double *temp = malloc(row*row*sizeof(double)); // d_zeros(&temp, row, row);
+//		char ta = 'n'; double alpha = 1; double beta = 0;
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, A2, &row);
+		dgemm_nn_3l(row, row, row, A, row, A, row, A2, row);
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A2, &row, A2, &row, &beta, A4, &row);
+		dgemm_nn_3l(row, row, row, A2, row, A2, row, A4, row);
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A4, &row, A2, &row, &beta, A6, &row);
+		dgemm_nn_3l(row, row, row, A4, row, A2, row, A6, row);
+		dmcopy(row, row, A2, row, U, row);
+//		dscal_(&row2, &c[9], U, &i1);
+		dscal_3l(row2, c[9], U);
+//		daxpy_(&row2, &c[11], A4, &i1, U, &i1);
+		daxpy_3l(row2, c[11], A4, U);
+//		daxpy_(&row2, &c[13], A6, &i1, U, &i1);
+		daxpy_3l(row2, c[13], A6, U);
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A6, &row, U, &row, &beta, temp, &row);
+		dgemm_nn_3l(row, row, row, A6, row, U, row, temp, row);
+//		daxpy_(&row2, &c[7], A6, &i1, temp, &i1);
+		daxpy_3l(row2, c[7], A6, temp);
+//		daxpy_(&row2, &c[5], A4, &i1, temp, &i1);
+		daxpy_3l(row2, c[5], A4, temp);
+//		daxpy_(&row2, &c[3], A2, &i1, temp, &i1);
+		daxpy_3l(row2, c[3], A2, temp);
+//		daxpy_(&row2, &c[1], A0, &i1, temp, &i1);
+		daxpy_3l(row2, c[1], A0, temp);
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, temp, &row, &beta, U, &row);
+		dgemm_nn_3l(row, row, row, A, row, temp, row, U, row);
+		dmcopy(row, row, A2, row, temp, row);
+//		dscal_(&row2, &c[8], V, &i1);
+		dscal_3l(row2, c[8], V);
+//		daxpy_(&row2, &c[12], A6, &i1, temp, &i1);
+		daxpy_3l(row2, c[12], A6, temp);
+//		daxpy_(&row2, &c[10], A4, &i1, temp, &i1);
+		daxpy_3l(row2, c[10], A4, temp);
+//		dgemm_(&ta, &ta, &row, &row, &row, &alpha, A6, &row, temp, &row, &beta, V, &row);
+		dgemm_nn_3l(row, row, row, A6, row, temp, row, V, row);
+//		daxpy_(&row2, &c[6], A6, &i1, V, &i1);
+		daxpy_3l(row2, c[6], A6, V);
+//		daxpy_(&row2, &c[4], A4, &i1, V, &i1);
+		daxpy_3l(row2, c[4], A4, V);
+//		daxpy_(&row2, &c[2], A2, &i1, V, &i1);
+		daxpy_3l(row2, c[2], A2, V);
+//		daxpy_(&row2, &c[0], A0, &i1, V, &i1);
+		daxpy_3l(row2, c[0], A0, V);
+		free(A0);
+		free(A2);
+		free(A4);
+		free(A6);
+		free(temp);
+		}
+	else
+		{
+		printf("%s\n", "Wrong Pade approximation degree");
+		exit(1);
+		}
+	double *D = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+//	dcopy_(&row2, V, &i1, A, &i1);
+	dmcopy(row, row, V, row, A, row);
+//	daxpy_(&row2, &d1, U, &i1, A, &i1);
+	daxpy_3l(row2, 1.0, U, A);
+//	dcopy_(&row2, V, &i1, D, &i1);
+	dmcopy(row, row, V, row, D, row);
+//	daxpy_(&row2, &dm1, U, &i1, D, &i1);
+	daxpy_3l(row2, -1.0, U, D);
+	int *ipiv = (int *) malloc(row*sizeof(int));
+	int info = 0;
+//	dgesv_(&row, &row, D, &row, ipiv, A, &row, &info);
+	dgesv_3l(row, row, D, row, ipiv, A, row, &info);
+	free(ipiv);
+	free(D);
+	free(U);
+	free(V);
+	}	
+
+
+
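+/*
+Scaling and squaring: when the one-norm of A exceeds the largest threshold theta[4],
+A is scaled by 2^(-s) so that the scaled norm fits the degree-13 approximant, and the
+result is then squared s times, using expm(A) = (expm(A * 2^(-s)))^(2^s).
+*/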
+void expm(int row, double *A)
+	{
+	
+	int i;
+	
+	int m_vals[] = {3, 5, 7, 9, 13};
+	double theta[] = {0.01495585217958292, 0.2539398330063230, 0.9504178996162932, 2.097847961257068, 5.371920351148152};
+	int lentheta = 5;
+	
+	double normA = onenorm(row, row, A);
+
+	if(normA<=theta[4])
+		{
+		for(i=0; i<lentheta; i++)
+			{
+			if(normA<=theta[i])
+				{
+				padeapprox(m_vals[i], row, A);
+				break;
+				}
+			}
+		}
+	else
+		{
+		int s;
+		double t = frexp(normA/(theta[4]), &s);
+		s = s - (t==0.5);
+		t = pow(2,-s);
+		int row2 = row*row;
+/*		int i1 = 1;*/
+//		dscal_(&row2, &t, A, &i1);
+		dscal_3l(row2, t, A);
+		padeapprox(m_vals[4], row, A);
+		double *temp = (double *) malloc(row*row*sizeof(double)); // d_zeros(&A2, row, row);
+//		char ta = 'n'; double alpha = 1; double beta = 0;
+		for(i=0; i<s; i++)
+			{
+//			dgemm_(&ta, &ta, &row, &row, &row, &alpha, A, &row, A, &row, &beta, temp, &row);
+			dgemm_nn_3l(row, row, row, A, row, A, row, temp, row);
+			dmcopy(row, row, temp, row, A, row);
+			}
+		free(temp);
+		}
+	}
+
+
diff --git a/examples/tools.h b/examples/tools.h
new file mode 100644
index 0000000..b017301
--- /dev/null
+++ b/examples/tools.h
@@ -0,0 +1,37 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of HPMPC.                                                                     *
+*                                                                                                 *
+* HPMPC -- Library for High-Performance implementation of solvers for MPC.                        *
+* Copyright (C) 2014-2015 by Technical University of Denmark. All rights reserved.                *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                                                                                                 *
+**************************************************************************************************/
+
+void dgemm_nn_3l(int m, int n, int k, double *A, int lda , double *B, int ldb, double *C, int ldc);
+void daxpy_3l(int n, double da, double *dx, double *dy);
+void dscal_3l(int n, double da, double *dx);
+
+/* copies a matrix into another matrix */
+void dmcopy(int row, int col, double *ptrA, int lda, double *ptrB, int ldb);
+
+/* solution of a system of linear equations */
+void dgesv_3l(int n, int nrhs, double *A, int lda, int *ipiv, double *B, int ldb, int *info);
+
+/* matrix exponential */
+void expm(int row, double *A);
diff --git a/include/blasfeo_block_size.h b/include/blasfeo_block_size.h
new file mode 100644
index 0000000..9b74139
--- /dev/null
+++ b/include/blasfeo_block_size.h
@@ -0,0 +1,88 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#ifndef BLASFEO_BLOCK_SIZE
+#define BLASFEO_BLOCK_SIZE
+
+
+
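+// D_PS / S_PS: panel height (number of rows stored contiguously per panel) for
+// double / single precision; D_NC / S_NC appear to be column-blocking sizes used
+// by the kernels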
+#if defined( TARGET_X64_INTEL_HASWELL )
+
+#define D_PS 4
+#define S_PS 8
+#define D_NC 4 // 2 // until the smaller kernel is 4x4
+#define S_NC 4 //2
+
+#elif defined( TARGET_X64_INTEL_SANDY_BRIDGE )
+
+#define D_PS 4
+#define S_PS 8
+#define D_NC 4 // 2 // until the smaller kernel is 4x4
+#define S_NC 4 //2
+
+#elif defined( TARGET_X64_INTEL_CORE )
+
+#define D_PS 4
+#define S_PS 4
+#define D_NC 4 // 2 // until the smaller kernel is 4x4
+#define S_NC 4 //2
+
+#elif defined( TARGET_X64_AMD_BULLDOZER )
+
+#define D_PS 4
+#define S_PS 4
+#define D_NC 4 // 2 // until the smaller kernel is 4x4
+#define S_NC 4 //2
+
+#elif defined( TARGET_ARMV8A_ARM_CORTEX_A57 )
+
+#define D_PS 4
+#define S_PS 4
+#define D_NC 4
+#define S_NC 4
+
+#elif defined( TARGET_ARMV7A_ARM_CORTEX_A15 )
+
+#define D_PS 4
+#define S_PS 4
+#define D_NC 4 // 2 // until the smaller kernel is 4x4
+#define S_NC 4 //2
+
+#elif defined( TARGET_GENERIC )
+
+#define D_PS 4
+#define S_PS 4
+#define D_NC 4 // 2 // until the smaller kernel is 4x4
+#define S_NC 4 //2
+
+#else
+#error "Unknown architecture"
+#endif
+
+
+#endif  // BLASFEO_BLOCK_SIZE
diff --git a/include/blasfeo_common.h b/include/blasfeo_common.h
new file mode 100644
index 0000000..3f95c91
--- /dev/null
+++ b/include/blasfeo_common.h
@@ -0,0 +1,146 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+#ifndef BLASFEO_COMMON
+#define BLASFEO_COMMON
+
+
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+#include "blasfeo_block_size.h"
+
+// matrix structure
+struct d_strmat
+	{
+	int m; // rows
+	int n; // cols
+	int pm; // packed number of rows
+	int cn; // packed number of cols
+	double *pA; // pointer to a pm*cn array of doubles, the first is aligned to cache line size
+	double *dA; // pointer to a min(m,n) (or max???) array of doubles
+	int use_dA; // flag to tell if dA can be used
+	int memory_size; // size of needed memory
+	};
+
+struct s_strmat
+	{
+	int m; // rows
+	int n; // cols
+	int pm; // packed number of rows
+	int cn; // packed number of cols
+	float *pA; // pointer to a pm*cn array of floats, the first is aligned to cache line size
+	float *dA; // pointer to a min(m,n) (or max???) array of floats
+	int use_dA; // flag to tell if dA can be used
+	int memory_size; // size of needed memory
+	};
+
+// vector structure
+struct d_strvec
+	{
+	int m; // size
+	int pm; // packed size
+	double *pa; // pointer to a pm array of doubles, the first is aligned to cache line size
+	int memory_size; // size of needed memory
+	};
+
+struct s_strvec
+	{
+	int m; // size
+	int pm; // packed size
+	float *pa; // pointer to a pm array of floats, the first is aligned to cache line size
+	int memory_size; // size of needed memory
+	};
+
+#define DMATEL_LIBSTR(sA,ai,aj) ((sA)->pA[((ai)-((ai)&(D_PS-1)))*(sA)->cn+(aj)*D_PS+((ai)&(D_PS-1))])
+#define SMATEL_LIBSTR(sA,ai,aj) ((sA)->pA[((ai)-((ai)&(S_PS-1)))*(sA)->cn+(aj)*S_PS+((ai)&(S_PS-1))])
+#define DVECEL_LIBSTR(sa,ai) ((sa)->pa[ai])
+#define SVECEL_LIBSTR(sa,ai) ((sa)->pa[ai])
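+// panel-major layout: the matrix is stored in horizontal panels of D_PS (resp. S_PS)
+// rows, each panel holding its cn columns contiguously with PS elements per column;
+// the macros above map (ai,aj) to panel offset (ai - ai%PS)*cn plus aj*PS + ai%PS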
+
+#elif defined(LA_BLAS) | defined(LA_REFERENCE)
+
+// matrix structure
+struct d_strmat
+	{
+	int m; // rows
+	int n; // cols
+	double *pA; // pointer to an m*n array of doubles
+	double *dA; // pointer to a min(m,n) (or max???) array of doubles
+	int use_dA; // flag to tell if dA can be used
+	int memory_size; // size of needed memory
+	};
+
+struct s_strmat
+	{
+	int m; // rows
+	int n; // cols
+	float *pA; // pointer to an m*n array of floats
+	float *dA; // pointer to a min(m,n) (or max???) array of floats
+	int use_dA; // flag to tell if dA can be used
+	int memory_size; // size of needed memory
+	};
+
+// vector structure
+struct d_strvec
+	{
+	int m; // size
+	double *pa; // pointer to an array of m doubles, the first is aligned to cache line size
+	int memory_size; // size of needed memory
+	};
+
+struct s_strvec
+	{
+	int m; // size
+	float *pa; // pointer to an array of m floats, the first is aligned to cache line size
+	int memory_size; // size of needed memory
+	};
+
+#define DMATEL_LIBSTR(sA,ai,aj) ((sA)->pA[(ai)+(aj)*(sA)->m])
+#define SMATEL_LIBSTR(sA,ai,aj) ((sA)->pA[(ai)+(aj)*(sA)->m])
+#define DVECEL_LIBSTR(sa,ai) ((sa)->pa[ai])
+#define SVECEL_LIBSTR(sa,ai) ((sa)->pa[ai])
+
+#else
+
+#error : wrong LA choice
+
+#endif
+
+#endif  // BLASFEO_COMMON
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/include/blasfeo_d_aux.h b/include/blasfeo_d_aux.h
new file mode 100644
index 0000000..c4f71ee
--- /dev/null
+++ b/include/blasfeo_d_aux.h
@@ -0,0 +1,138 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdio.h>
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+/************************************************
+* d_aux_lib.c
+************************************************/
+
+// returns the memory size (in bytes) needed for a strmat
+int d_size_strmat(int m, int n);
+// returns the memory size (in bytes) needed for the diagonal of a strmat
+int d_size_diag_strmat(int m, int n);
+// returns the memory size (in bytes) needed for a strvec
+int d_size_strvec(int m);
+// create a strmat for a matrix of size m*n by using memory passed by a pointer (pointer is not updated)
+void d_create_strmat(int m, int n, struct d_strmat *sA, void *memory);
+// create a strvec for a vector of size m by using memory passed by a pointer (pointer is not updated)
+void d_create_strvec(int m, struct d_strvec *sA, void *memory);
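+// typical pattern (sketch): allocate d_size_strmat(m, n) bytes externally, then call
+// d_create_strmat(m, n, &sA, memory) to map the strmat onto that buffer; the
+// d_allocate_* helpers in blasfeo_d_aux_ext_dep.h are the dynamically-allocating counterpart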
+void d_cvt_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj);
+void d_cvt_vec2strvec(int m, double *a, struct d_strvec *sa, int ai);
+void d_cvt_tran_mat2strmat(int m, int n, double *A, int lda, struct d_strmat *sA, int ai, int aj);
+void d_cvt_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda);
+void d_cvt_strvec2vec(int m, struct d_strvec *sa, int ai, double *a);
+void d_cvt_tran_strmat2mat(int m, int n, struct d_strmat *sA, int ai, int aj, double *A, int lda);
+void d_cast_mat2strmat(double *A, struct d_strmat *sA);
+void d_cast_diag_mat2strmat(double *dA, struct d_strmat *sA);
+void d_cast_vec2vecmat(double *a, struct d_strvec *sa);
+void dgein1_libstr(double a, struct d_strmat *sA, int ai, int aj);
+double dgeex1_libstr(struct d_strmat *sA, int ai, int aj);
+void dvecin1_libstr(double a, struct d_strvec *sx, int xi);
+double dvecex1_libstr(struct d_strvec *sx, int xi);
+// A <= alpha
+void dgese_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj);
+// a <= alpha
+void dvecse_libstr(int m, double alpha, struct d_strvec *sx, int xi);
+void dgecp_lib(int m, int n, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb);
+void dgecp_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj);
+void dgesc_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj);
+void dveccp_libstr(int m, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci);
+void dvecsc_libstr(int m, double alpha, struct d_strvec *sa, int ai);
+void dtrcp_l_lib(int m, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb);
+void dtrcp_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj);
+void dgead_lib(int m, int n, double alpha, int offsetA, double *A, int sda, int offsetB, double *B, int sdb);
+void dgead_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj);
+void dvecad_libstr(int m, double alpha, struct d_strvec *sa, int ai, struct d_strvec *sc, int ci);
+void dgetr_lib(int m, int n, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc);
+void dgetr_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj);
+void dtrtr_l_lib(int m, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc);
+void dtrtr_l_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj);
+void dtrtr_u_lib(int m, double alpha, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc);
+void dtrtr_u_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj);
+void ddiareg_lib(int kmax, double reg, int offset, double *pD, int sdd);
+void ddiare_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj);
+void ddiain_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj);
+void ddiain_sqrt_lib(int kmax, double *x, int offset, double *pD, int sdd);
+void ddiaex_lib(int kmax, double alpha, int offset, double *pD, int sdd, double *x);
+void ddiaad_lib(int kmax, double alpha, double *x, int offset, double *pD, int sdd);
+void ddiain_libsp(int kmax, int *idx, double alpha, double *x, double *pD, int sdd);
+void ddiain_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj);
+void ddiaex_libsp(int kmax, int *idx, double alpha, double *pD, int sdd, double *x);
+void ddiaex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi);
+void ddiaex_sp_libstr(int kmax, double alpha, int *idx, struct d_strmat *sD, int di, int dj, struct d_strvec *sx, int xi);
+void ddiaad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj);
+void ddiaad_libsp(int kmax, int *idx, double alpha, double *x, double *pD, int sdd);
+void ddiaad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj);
+void ddiaadin_libsp(int kmax, int *idx, double alpha, double *x, double *y, double *pD, int sdd);
+void ddiaadin_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, int *idx, struct d_strmat *sD, int di, int dj);
+void drowin_lib(int kmax, double alpha, double *x, double *pD);
+void drowin_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj);
+void drowex_lib(int kmax, double alpha, double *pD, double *x);
+void drowex_libstr(int kmax, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi);
+void drowad_lib(int kmax, double alpha, double *x, double *pD);
+void drowad_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj);
+void drowin_libsp(int kmax, double alpha, int *idx, double *x, double *pD);
+void drowad_libsp(int kmax, int *idx, double alpha, double *x, double *pD);
+void drowad_sp_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strmat *sD, int di, int dj);
+void drowadin_libsp(int kmax, int *idx, double alpha, double *x, double *y, double *pD);
+void drowsw_lib(int kmax, double *pA, double *pC);
+void drowsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj);
+void drowpe_libstr(int kmax, int *ipiv, struct d_strmat *sA);
+void dcolex_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi);
+void dcolin_lib(int kmax, double *x, int offset, double *pD, int sdd);
+void dcolin_libstr(int kmax, struct d_strvec *sx, int xi, struct d_strmat *sA, int ai, int aj);
+void dcolad_lib(int kmax, double alpha, double *x, int offset, double *pD, int sdd);
+void dcolin_libsp(int kmax, int *idx, double *x, double *pD, int sdd);
+void dcolad_libsp(int kmax, double alpha, int *idx, double *x, double *pD, int sdd);
+void dcolsw_lib(int kmax, int offsetA, double *pA, int sda, int offsetC, double *pC, int sdc);
+void dcolsw_libstr(int kmax, struct d_strmat *sA, int ai, int aj, struct d_strmat *sC, int ci, int cj);
+void dcolpe_libstr(int kmax, int *ipiv, struct d_strmat *sA);
+void dvecin_libsp(int kmax, int *idx, double *x, double *y);
+void dvecad_libsp(int kmax, int *idx, double alpha, double *x, double *y);
+void dvecad_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi);
+void dvecin_sp_libstr(int m, double alpha, struct d_strvec *sx, int xi, int *idx, struct d_strvec *sz, int zi);
+void dvecex_sp_libstr(int m, double alpha, int *idx, struct d_strvec *sx, int x, struct d_strvec *sz, int zi);
+void dveccl_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi);
+void dveccl_mask_libstr(int m, struct d_strvec *sxm, int xim, struct d_strvec *sx, int xi, struct d_strvec *sxp, int xip, struct d_strvec *sz, int zi, struct d_strvec *sm, int mi);
+void dvecze_libstr(int m, struct d_strvec *sm, int mi, struct d_strvec *sv, int vi, struct d_strvec *se, int ei);
+void dvecnrm_inf_libstr(int m, struct d_strvec *sx, int xi, double *ptr_norm);
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/include/blasfeo_d_aux_ext_dep.h b/include/blasfeo_d_aux_ext_dep.h
new file mode 100644
index 0000000..7b0222b
--- /dev/null
+++ b/include/blasfeo_d_aux_ext_dep.h
@@ -0,0 +1,111 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(EXT_DEP)
+
+
+
+#include <stdio.h>
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+/************************************************
+* d_aux_extern_depend_lib.c
+************************************************/
+
+/* column-major matrices */
+
+// dynamically allocate row*col doubles, point *pA to the new memory, and zero-initialize it
+void d_zeros(double **pA, int row, int col);
+// dynamically allocate row*col doubles aligned to a 64-byte boundary, point *pA to the new memory, and zero-initialize it
+void d_zeros_align(double **pA, int row, int col);
+// dynamically allocate size bytes aligned to a 64-byte boundary, point *pA to the new memory, and zero-initialize it
+void d_zeros_align_bytes(double **pA, int size);
+// free the memory allocated by d_zeros
+void d_free(double *pA);
+// free the memory allocated by d_zeros_align or d_zeros_align_bytes
+void d_free_align(double *pA);
+// print a column-major matrix
+void d_print_mat(int m, int n, double *A, int lda);
+// print the transpose of a column-major matrix
+void d_print_tran_mat(int row, int col, double *A, int lda);
+// print to file a column-major matrix
+void d_print_to_file_mat(FILE *file, int row, int col, double *A, int lda);
+// print to file the transpose of a column-major matrix
+void d_print_tran_to_file_mat(FILE *file, int row, int col, double *A, int lda);
+// print in exponential notation a column-major matrix
+void d_print_e_mat(int m, int n, double *A, int lda);
+// print in exponential notation the transpose of a column-major matrix
+void d_print_e_tran_mat(int row, int col, double *A, int lda);
+
+/* strmat and strvec */
+
+#ifdef BLASFEO_COMMON
+// create a strmat for a matrix of size m*n by dynamically allocating memory
+void d_allocate_strmat(int m, int n, struct d_strmat *sA);
+// create a strvec for a vector of size m by dynamically allocating memory
+void d_allocate_strvec(int m, struct d_strvec *sa);
+// free the memory allocated by d_allocate_strmat
+void d_free_strmat(struct d_strmat *sA);
+// free the memory allocated by d_allocate_strvec
+void d_free_strvec(struct d_strvec *sa);
+// print a strmat
+void d_print_strmat(int m, int n, struct d_strmat *sA, int ai, int aj);
+// print in exponential notation a strmat
+void d_print_e_strmat(int m, int n, struct d_strmat *sA, int ai, int aj);
+// print to file a strmat
+void d_print_to_file_strmat(FILE *file, int m, int n, struct d_strmat *sA, int ai, int aj);
+// print a strvec
+void d_print_strvec(int m, struct d_strvec *sa, int ai);
+// print in exponential notation a strvec
+void d_print_e_strvec(int m, struct d_strvec *sa, int ai);
+// print to file a strvec
+void d_print_to_file_strvec(FILE *file, int m, struct d_strvec *sa, int ai);
+// print the transpose of a strvec
+void d_print_tran_strvec(int m, struct d_strvec *sa, int ai);
+// print in exponential notation the transpose of a strvec
+void d_print_e_tran_strvec(int m, struct d_strvec *sa, int ai);
+// print to file the transpose of a strvec
+void d_print_tran_to_file_strvec(FILE *file, int m, struct d_strvec *sa, int ai);
+#endif
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+
+#endif // EXT_DEP
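
As a hedged sketch of how the allocation and printing helpers above are typically used (assumption: d_cvt_mat2strmat is declared in blasfeo_d_aux.h with the same shape as the single-precision s_cvt_mat2strmat that appears later in this patch):

#include <stdio.h>
#include "blasfeo_target.h"
#include "blasfeo_common.h"
#include "blasfeo_d_aux.h"
#include "blasfeo_d_aux_ext_dep.h"

int main()
	{
	int m = 3, n = 2;
	double A[] = {1, 2, 3, 4, 5, 6};          // column-major 3x2
	struct d_strmat sA;
	d_allocate_strmat(m, n, &sA);             // dynamically allocated strmat
	d_cvt_mat2strmat(m, n, A, m, &sA, 0, 0);  // copy the column-major data into the strmat
	d_print_strmat(m, n, &sA, 0, 0);          // print the m-by-n block starting at (0,0)
	d_free_strmat(&sA);
	return 0;
	}
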
diff --git a/include/blasfeo_d_blas.h b/include/blasfeo_d_blas.h
new file mode 100644
index 0000000..a473322
--- /dev/null
+++ b/include/blasfeo_d_blas.h
@@ -0,0 +1,159 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+//
+// level 1 BLAS
+//
+
+// y = y + alpha*x
+void daxpy_libstr(int kmax, double alpha, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi);
+// z = x .* y, return sum(z) = x^T * y
+double dvecmuldot_libstr(int m, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi);
+// return x^T * y
+double ddot_libstr(int m, struct d_strvec *sx, int xi, struct d_strvec *sy, int yi);
+
+
+
+//
+// level 2 BLAS
+//
+
+// dense
+
+// z <= beta * y + alpha * A * x
+void dgemv_n_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi);
+// z <= beta * y + alpha * A' * x
+void dgemv_t_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi);
+// y <= inv( A ) * x, A (m)x(n)
+void dtrsv_lnn_mn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(n)
+void dtrsv_ltn_mn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// y <= inv( A ) * x, A (m)x(m) lower, not_transposed, not_unit
+void dtrsv_lnn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// y <= inv( A ) * x, A (m)x(m) lower, not_transposed, unit
+void dtrsv_lnu_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(m) lower, transposed, not_unit
+void dtrsv_ltn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(m) lower, transposed, unit
+void dtrsv_ltu_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// y <= inv( A ) * x, A (m)x(m) upper, not_transposed, not_unit
+void dtrsv_unn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(m) upper, transposed, not_unit
+void dtrsv_utn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// z <= A * x ; A upper triangular
+void dtrmv_unn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// z <= A' * x ; A upper triangular
+void dtrmv_utn_libstr(int m, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// z <= A * x ; A lower triangular
+void dtrmv_lnn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// z <= A' * x ; A lower triangular
+void dtrmv_ltn_libstr(int m, int n, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, struct d_strvec *sz, int zi);
+// z_n <= beta_n * y_n + alpha_n * A  * x_n
+// z_t <= beta_t * y_t + alpha_t * A' * x_t
+void dgemv_nt_libstr(int m, int n, double alpha_n, double alpha_t, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx_n, int xi_n, struct d_strvec *sx_t, int xi_t, double beta_n, double beta_t, struct d_strvec *sy_n, int yi_n, struct d_strvec *sy_t, int yi_t, struct d_strvec *sz_n, int zi_n, struct d_strvec *sz_t, int zi_t);
+// z <= beta * y + alpha * A * x, where A is symmetric and only the lower triangular part of A is accessed
+void dsymv_l_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi);
+
+// diagonal
+
+// z <= beta * y + alpha * A * x, A diagonal
+void dgemv_diag_libstr(int m, double alpha, struct d_strvec *sA, int ai, struct d_strvec *sx, int xi, double beta, struct d_strvec *sy, int yi, struct d_strvec *sz, int zi);
+
+
+
+//
+// level 3 BLAS
+//
+
+// dense
+
+// D <= beta * C + alpha * A * B^T
+void dgemm_nt_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+// D <= beta * C + alpha * A * B
+void dgemm_nn_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+// D <= beta * C + alpha * A * B^T ; C, D lower triangular
+void dsyrk_ln_libstr(int m, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+void dsyrk_ln_mn_libstr(int m, int n, int k, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+// D <= alpha * B * A^T ; B upper triangular
+void dtrmm_rutn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj);
+// D <= alpha * B * A ; A lower triangular
+void dtrmm_rlnn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj);
+// D <= alpha * B * A^{-T} , with A lower triangular employing explicit inverse of diagonal
+void dtrsm_rltn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj);
+// D <= alpha * B * A^{-T} , with A lower triangular with unit diagonal
+void dtrsm_rltu_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj);
+// D <= alpha * B * A^{-T} , with A upper triangular employing explicit inverse of diagonal
+void dtrsm_rutn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj);
+// D <= alpha * A^{-1} * B , with A lower triangular with unit diagonal
+void dtrsm_llnu_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj);
+// D <= alpha * A^{-1} * B , with A upper triangular employing explicit inverse of diagonal
+void dtrsm_lunn_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sD, int di, int dj);
+
+// diagonal
+
+// D <= alpha * A * B + beta * C, with A diagonal (stored as strvec)
+void dgemm_diag_left_ib(int m, int n, double alpha, double *dA, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd);
+void dgemm_l_diag_libstr(int m, int n, double alpha, struct d_strvec *sA, int ai, struct d_strmat *sB, int bi, int bj, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+// D <= alpha * A * B + beta * C, with B diagonal (stored as strvec)
+void dgemm_r_diag_libstr(int m, int n, double alpha, struct d_strmat *sA, int ai, int aj, struct d_strvec *sB, int bi, double beta, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+
+
+
+//
+// LAPACK
+//
+
+// D <= chol( C ) ; C, D lower triangular
+void dpotrf_l_libstr(int m, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+void dpotrf_l_mn_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+// D <= chol( C + A * B' ) ; C, D lower triangular
+void dsyrk_dpotrf_ln_libstr(int m, int n, int k, struct d_strmat *sA, int ai, int aj, struct d_strmat *sB, int bi, int bj, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+// D <= lu( C ) ; no pivoting
+void dgetrf_nopivot_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj);
+// D <= lu( C ) ; pivoting
+void dgetrf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, int *ipiv);
+// D <= qr( C )
+void dgeqrf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, void *work);
+int dgeqrf_work_size_libstr(int m, int n); // in bytes
+// D <= lq( C )
+void dgelqf_libstr(int m, int n, struct d_strmat *sC, int ci, int cj, struct d_strmat *sD, int di, int dj, void *work);
+int dgelqf_work_size_libstr(int m, int n); // in bytes
+
+
+
+#ifdef __cplusplus
+}
+#endif
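
A sketch combining the level-3 BLAS and LAPACK routines declared above: it builds a symmetric positive definite D = A*A^T with dgemm_nt_libstr and factorizes it with dpotrf_l_libstr (assumptions: blasfeo_common.h provides struct d_strmat, and d_cvt_mat2strmat in blasfeo_d_aux.h mirrors the single-precision s_cvt_mat2strmat declared later in this patch):

#include <stdio.h>
#include "blasfeo_target.h"
#include "blasfeo_common.h"
#include "blasfeo_d_aux.h"
#include "blasfeo_d_aux_ext_dep.h"
#include "blasfeo_d_blas.h"

int main()
	{
	int n = 4;
	double A0[16], C0[16] = {0.0};
	for(int j = 0; j < n; j++)
		for(int i = 0; i < n; i++)
			A0[i+n*j] = (i == j ? 2.0 : 1.0);  // A = I + ones(n,n), so A*A' is SPD
	struct d_strmat sA, sC, sD, sL;
	d_allocate_strmat(n, n, &sA);
	d_allocate_strmat(n, n, &sC);
	d_allocate_strmat(n, n, &sD);
	d_allocate_strmat(n, n, &sL);
	d_cvt_mat2strmat(n, n, A0, n, &sA, 0, 0);
	d_cvt_mat2strmat(n, n, C0, n, &sC, 0, 0);  // C explicitly zeroed before use
	// D <= 0.0*C + 1.0 * A * A^T
	dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
	// L <= chol(D), lower triangular
	dpotrf_l_libstr(n, &sD, 0, 0, &sL, 0, 0);
	d_print_strmat(n, n, &sL, 0, 0);
	d_free_strmat(&sA); d_free_strmat(&sC); d_free_strmat(&sD); d_free_strmat(&sL);
	return 0;
	}
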
diff --git a/include/blasfeo_d_kernel.h b/include/blasfeo_d_kernel.h
new file mode 100644
index 0000000..6f045af
--- /dev/null
+++ b/include/blasfeo_d_kernel.h
@@ -0,0 +1,308 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// level 2 BLAS
+// 12
+void kernel_dgemv_n_12_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+void kernel_dgemv_t_12_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+// 8
+void kernel_dgemv_n_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+void kernel_dgemv_t_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+void kernel_dtrmv_un_8_lib4(int k, double *A, int sda, double *x, double *z);
+// 4
+void kernel_dgemv_n_4_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z);
+void kernel_dgemv_n_4_vs_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k1);
+void kernel_dgemv_n_4_gen_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k0, int k1);
+void kernel_dgemv_t_4_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+void kernel_dgemv_t_4_vs_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z, int k1);
+void kernel_dgemv_t_4_gen_lib4(int k, double *alpha, int offA, double *A, int sda, double *x, double *beta, double *C, double *D, int km);
+void kernel_dtrsv_ln_inv_4_lib4(int k, double *A, double *inv_diag_A, double *x, double *y, double *z);
+void kernel_dtrsv_ln_inv_4_vs_lib4(int k, double *A, double *inv_diag_A, double *x, double *y, double *z, int km, int kn);
+void kernel_dtrsv_lt_inv_4_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+void kernel_dtrsv_lt_inv_3_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+void kernel_dtrsv_lt_inv_2_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+void kernel_dtrsv_lt_inv_1_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+void kernel_dtrmv_un_4_lib4(int k, double *A, double *x, double *z);
+void kernel_dtrmv_ut_4_lib4(int k, double *A, int sda, double *x, double *z);
+void kernel_dtrmv_ut_4_vs_lib4(int k, double *A, int sda, double *x, double *z, int km);
+void kernel_dgemv_nt_6_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
+void kernel_dgemv_nt_4_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
+void kernel_dgemv_nt_4_vs_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t, int km);
+void kernel_dsymv_l_4_lib4(int kmax, double *alpha, double *A, int sda, double *x, double *z);
+void kernel_dsymv_l_4_gen_lib4(int kmax, double *alpha, int offA, double *A, int sda, double *x, double *z, int km);
+
+
+
+// level 3 BLAS
+// 12x4
+void kernel_dgemm_nt_12x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dgemm_nt_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); //
+void kernel_dgemm_nn_12x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dgemm_nn_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); //
+void kernel_dsyrk_nt_l_12x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dsyrk_nt_l_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); //
+void kernel_dtrmm_nt_ru_12x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dtrmm_nt_ru_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); //
+void kernel_dtrmm_nn_rl_12x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd);
+void kernel_dtrmm_nn_rl_12x4_vs_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd, int km, int kn);
+void kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nt_rl_inv_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dtrsm_nt_rl_one_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E);
+void kernel_dtrsm_nt_rl_one_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, int km, int kn);
+void kernel_dtrsm_nt_ru_inv_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nn_ru_inv_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dtrsm_nn_ru_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nn_ll_one_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde);
+void kernel_dtrsm_nn_ll_one_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, int km, int kn);
+void kernel_dtrsm_nn_lu_inv_12x4_lib4(int kmax, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
+void kernel_dtrsm_nn_lu_inv_12x4_vs_lib4(int kmax, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+// 4x12
+void kernel_dgemm_nt_4x12_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D); //
+void kernel_dgemm_nt_4x12_vs_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D, int km, int kn); //
+void kernel_dgemm_nn_4x12_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D); //
+void kernel_dtrsm_nt_rl_inv_4x12_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sed, double *inv_diag_E);
+void kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sed, double *inv_diag_E, int km, int kn);
+// 8x8
+void kernel_dgemm_nt_8x8l_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd); // computes [A00 *; A10 A11]
+void kernel_dgemm_nt_8x8u_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd); // computes [A00 *; A10 A11]
+void kernel_dgemm_nt_8x8l_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); // computes [A00 *; A10 A11]
+void kernel_dgemm_nt_8x8u_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); // computes [A00 *; A10 A11]
+void kernel_dsyrk_nt_l_8x8_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd); // computes [L00 *; A10 L11]
+void kernel_dsyrk_nt_l_8x8_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); // computes [L00 *; A10 L11]
+void kernel_dtrsm_nt_rl_inv_8x8l_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sed, double *inv_diag_E);
+void kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sed, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nt_rl_inv_8x8u_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sed, double *inv_diag_E);
+void kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sed, double *inv_diag_E, int km, int kn);
+// 8x4
+void kernel_dgemm_nt_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dgemm_nt_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); //
+void kernel_dgemm_nt_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int k0, int k1);
+void kernel_dgemm_nn_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dgemm_nn_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1); //
+void kernel_dsyrk_nt_l_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dsyrk_nt_l_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); //
+void kernel_dsyrk_nt_l_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int k0, int k1);
+void kernel_dtrmm_nt_ru_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dtrmm_nt_ru_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn); //
+void kernel_dtrmm_nn_rl_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd);
+void kernel_dtrmm_nn_rl_8x4_vs_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd, int km, int kn);
+void kernel_dtrmm_nn_rl_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nt_rl_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dtrsm_nt_rl_one_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E);
+void kernel_dtrsm_nt_rl_one_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, int km, int kn);
+void kernel_dtrsm_nt_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nn_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nn_ll_one_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde);
+void kernel_dtrsm_nn_ll_one_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, int km, int kn);
+void kernel_dtrsm_nn_lu_inv_8x4_lib4(int kmax, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
+void kernel_dtrsm_nn_lu_inv_8x4_vs_lib4(int kmax, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+// 4x8
+void kernel_dgemm_nt_4x8_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D); //
+void kernel_dgemm_nt_4x8_vs_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D, int km, int kn); //
+void kernel_dgemm_nn_4x8_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D); //
+void kernel_dtrsm_nt_rl_inv_4x8_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sed, double *inv_diag_E);
+void kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sed, double *inv_diag_E, int km, int kn);
+// 4x4
+void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D); //
+void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn); //
+void kernel_dgemm_nt_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_dgemm_nn_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D); //
+void kernel_dgemm_nn_4x4_gen_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1); //
+void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D); //
+void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn); //
+void kernel_dsyrk_nt_l_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D); //
+void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn); //
+void kernel_dtrmm_nn_rl_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *D);
+void kernel_dtrmm_nn_rl_4x4_gen_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E);
+void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, int km, int kn);
+void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nn_ru_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
+void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dtrsm_nn_ll_one_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E);
+void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn);
+void kernel_dtrsm_nn_lu_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
+void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+// diag
+void kernel_dgemm_diag_right_4_a0_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *D, int sdd);
+void kernel_dgemm_diag_right_4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+void kernel_dgemm_diag_right_3_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+void kernel_dgemm_diag_right_2_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+void kernel_dgemm_diag_right_1_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+void kernel_dgemm_diag_left_4_a0_lib4(int kmax, double *alpha, double *A, double *B, double *D);
+void kernel_dgemm_diag_left_4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+void kernel_dgemm_diag_left_3_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+void kernel_dgemm_diag_left_2_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+void kernel_dgemm_diag_left_1_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+// low rank update
+void kernel_dger4_sub_12r_lib4(int k, double *A, int sda, double *B, double *C, int sdc);
+void kernel_dger4_sub_12r_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, int km);
+void kernel_dger4_sub_8r_lib4(int k, double *A, int sda, double *B, double *C, int sdc);
+void kernel_dger4_sub_8r_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, int km);
+void kernel_dger4_sub_4r_lib4(int n, double *A, double *B, double *C);
+void kernel_dger4_sub_4r_vs_lib4(int n, double *A, double *B, double *C, int km);
+
+
+
+// LAPACK
+// 12x4
+void kernel_dpotrf_nt_l_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dpotrf_nt_l_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+void kernel_dgetrf_nn_l_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dgetrf_nn_m_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dgetrf_nn_r_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dgetrf_nn_l_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+void kernel_dgetrf_nn_m_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+void kernel_dgetrf_nn_r_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+// 8x8
+void kernel_dpotrf_nt_l_8x8_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dpotrf_nt_l_8x8_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+// 8x4
+void kernel_dpotrf_nt_l_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dpotrf_nt_l_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+void kernel_dgetrf_nn_l_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dgetrf_nn_r_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dgetrf_nn_l_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+void kernel_dgetrf_nn_r_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+// 4x4
+void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
+void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
+#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+void kernel_dlauum_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D); //
+void kernel_dlauum_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn); //
+#endif
+void kernel_dgetrf_nn_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D);
+void kernel_dgetrf_nn_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn);
+void kernel_dgetrf_pivot_4_lib4(int m, double *pA, int sda, double *inv_diag_A, int* ipiv);
+void kernel_dgetrf_pivot_4_vs_lib4(int m, int n, double *pA, int sda, double *inv_diag_A, int* ipiv);
+void kernel_dgeqrf_4_lib4(int m, double *pD, int sdd, double *dD);
+void kernel_dgeqrf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD);
+void kernel_dlarf_4_lib4(int m, int n, double *pV, int sdv, double *tau, double *pC, int sdc); // rank-4 reflector
+void kernel_dlarf_t_4_lib4(int m, int n, double *pD, int sdd, double *pVt, double *dD, double *pC0, int sdc, double *pW);
+void kernel_dgelqf_4_lib4(int n, double *pD, double *dD);
+void kernel_dgelqf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD);
+void kernel_dlarft_4_lib4(int kmax, double *pD, double *dD, double *pT);
+void kernel_dgelqf_dlarft12_12_lib4(int n, double *pD, int sdd, double *dD, double *pT);
+void kernel_dgelqf_dlarft4_12_lib4(int n, double *pD, int sdd, double *dD, double *pT);
+void kernel_dgelqf_dlarft4_8_lib4(int n, double *pD, int sdd, double *dD, double *pT);
+void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT);
+void kernel_dlarfb12_r_4_lib4(int kmax, double *pV, int sdd, double *pT, double *pD, double *pK, int km);
+void kernel_dlarfb4_r_12_lib4(int kmax, double *pV, double *pT, double *pD, int sdd);
+void kernel_dlarfb4_r_8_lib4(int kmax, double *pV, double *pT, double *pD, int sdd);
+void kernel_dlarfb4_r_4_lib4(int kmax, double *pV, double *pT, double *pD);
+void kernel_dlarfb4_r_1_lib4(int kmax, double *pV, double *pT, double *pD);
+
+
+
+// merged routines
+// 12x4
+void kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+void kernel_dsyrk_dpotrf_nt_l_12x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+// 4x12
+void kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4(int kp, double *Ap, double *Bp, int sdbp, int km_, double *Am, double *Bm, int sdbm, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
+// 8x8
+void kernel_dsyrk_dpotrf_nt_l_8x8_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km_, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km_, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+void kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdb, int km_, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+void kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdb, int km_, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+// 8x4
+void kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dsyrk_dpotrf_nt_l_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+// 4x8
+void kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4(int kp, double *Ap, double *Bp, int sdbp, int km_, double *Am, double *Bm, int sdbm, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
+// 4x4
+void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
+void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
+void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
+
+
+
+// auxiliary routines
+void kernel_dgecp_8_0_lib4(int tri, int kmax, double alpha, double *A0, int sda,  double *B0, int sdb);
+void kernel_dgecp_8_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb);
+void kernel_dgecp_8_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb);
+void kernel_dgecp_8_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B0, int sdb);
+void kernel_dgecp_4_0_lib4(int tri, int kmax, double alpha, double *A, double *B);
+void kernel_dgecp_4_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgecp_4_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgecp_4_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgecp_3_0_lib4(int tri, int kmax, double alpha, double *A, double *B);
+void kernel_dgecp_3_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgecp_3_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgecp_2_0_lib4(int tri, int kmax, double alpha, double *A, double *B);
+void kernel_dgecp_2_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgecp_1_0_lib4(int tri, int kmax, double alpha, double *A, double *B);
+void kernel_dgead_8_0_lib4(int kmax, double alpha, double *A0, int sda,  double *B0, int sdb);
+void kernel_dgead_8_1_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb);
+void kernel_dgead_8_2_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb);
+void kernel_dgead_8_3_lib4(int kmax, double alpha, double *A0, int sda, double *B0, int sdb);
+void kernel_dgead_4_0_lib4(int kmax, double alpha, double *A, double *B);
+void kernel_dgead_4_1_lib4(int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgead_4_2_lib4(int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgead_4_3_lib4(int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgead_3_0_lib4(int kmax, double alpha, double *A, double *B);
+void kernel_dgead_3_2_lib4(int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgead_3_3_lib4(int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgead_2_0_lib4(int kmax, double alpha, double *A, double *B);
+void kernel_dgead_2_3_lib4(int kmax, double alpha, double *A0, int sda, double *B);
+void kernel_dgead_1_0_lib4(int kmax, double alpha, double *A, double *B);
+void kernel_dgeset_4_lib4(int kmax, double alpha, double *A);
+void kernel_dtrset_4_lib4(int kmax, double alpha, double *A);
+void kernel_dgetr_8_lib4(int tri, int kmax, int kna, double alpha, double *A, int sda, double *C, int sdc);
+void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc);
+void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc);
+void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc);
+void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc);
+void kernel_dgetr_4_0_lib4(int m, double *A, int sda, double *B);
+
+
+
+#ifdef __cplusplus
+}
+#endif
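
The kernels above are the internal building blocks behind the *_libstr routines, and calling them directly requires the packed panel-major storage. The following is a hypothetical sketch of a single 4x4 dgemm kernel call; the 4-row panel layout (element (i,j) of a panel stored at p[i+4*j]) and the use of 64-byte-aligned buffers are assumptions about the lib4 format, not something documented in this header:

#include <stdio.h>
#include "blasfeo_target.h"
#include "blasfeo_d_aux_ext_dep.h"
#include "blasfeo_d_kernel.h"

int main()
	{
	int k = 8;
	double *pA, *pB, *pC, *pD;
	d_zeros_align(&pA, 4, k);   // one 4 x k panel, zero-initialized, 64-byte aligned
	d_zeros_align(&pB, 4, k);   // B is accessed as B^T by the nt kernel
	d_zeros_align(&pC, 4, 4);
	d_zeros_align(&pD, 4, 4);
	for(int j = 0; j < k; j++)
		for(int i = 0; i < 4; i++)
			{
			pA[i+4*j] = 1.0;                  // A = ones(4,k)
			pB[i+4*j] = (i == 0 ? 1.0 : 0.0); // row 0 of B = ones, other rows zero
			}
	double alpha = 1.0, beta = 0.0;
	// D <= beta*C + alpha * A * B^T on a single 4x4 block
	kernel_dgemm_nt_4x4_lib4(k, &alpha, pA, pB, &beta, pC, pD);
	d_print_mat(4, 4, pD, 4);   // a single 4-row panel coincides with column-major, lda = 4
	d_free_align(pA); d_free_align(pB); d_free_align(pC); d_free_align(pD);
	return 0;
	}

In normal use the *_libstr wrappers in blasfeo_d_blas.h handle the packing and kernel selection, so direct kernel calls like this are only a sketch of what happens underneath.
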
diff --git a/include/blasfeo_i_aux_ext_dep.h b/include/blasfeo_i_aux_ext_dep.h
new file mode 100644
index 0000000..5f47088
--- /dev/null
+++ b/include/blasfeo_i_aux_ext_dep.h
@@ -0,0 +1,60 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(EXT_DEP)
+
+
+
+#include <stdio.h>
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+// i_aux_extern_depend_lib
+void int_zeros(int **pA, int row, int col);
+void int_zeros_align(int **pA, int row, int col);
+void int_free(int *pA);
+void int_free_align(int *pA);
+void int_print_mat(int row, int col, int *A, int lda);
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+
+#endif // EXT_DEP
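
A minimal sketch of the integer helpers above (assumption: EXT_DEP is defined by the generated blasfeo_target.h, as the guard in this header requires):

#include <stdio.h>
#include "blasfeo_target.h"
#include "blasfeo_i_aux_ext_dep.h"

int main()
	{
	int m = 2, n = 3;
	int *A;
	int_zeros(&A, m, n);         // allocate and zero-initialize a 2x3 integer matrix
	for(int j = 0; j < n; j++)
		for(int i = 0; i < m; i++)
			A[i+m*j] = i + 10*j;
	int_print_mat(m, n, A, m);   // column-major print with lda = m
	int_free(A);
	return 0;
	}
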
diff --git a/include/blasfeo_m_aux.h b/include/blasfeo_m_aux.h
new file mode 100644
index 0000000..bbaac28
--- /dev/null
+++ b/include/blasfeo_m_aux.h
@@ -0,0 +1,45 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+void m_cvt_d2s_strvec(int m, struct d_strvec *vd, int vdi, struct s_strvec *vs, int vsi);
+void m_cvt_s2d_strvec(int m, struct s_strvec *vs, int vsi, struct d_strvec *vd, int vdi);
+void m_cvt_d2s_strmat(int m, int n, struct d_strmat *Md, int mid, int nid, struct s_strmat *Ms, int mis, int nis);
+void m_cvt_s2d_strmat(int m, int n, struct s_strmat *Ms, int mis, int nis, struct d_strmat *Md, int mid, int nid);
+
+
+#ifdef __cplusplus
+}
+#endif
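
A sketch of the double-to-single conversion routines above, wired together with the single-precision helpers declared in blasfeo_s_aux.h below (assumptions: d_cvt_vec2strvec is the double-precision analogue of s_cvt_vec2strvec, and plain malloc is acceptable for the memory handed to s_create_strvec; an aligned allocation may be preferable in practice):

#include <stdio.h>
#include <stdlib.h>
#include "blasfeo_target.h"
#include "blasfeo_common.h"
#include "blasfeo_d_aux.h"
#include "blasfeo_d_aux_ext_dep.h"
#include "blasfeo_s_aux.h"
#include "blasfeo_m_aux.h"

int main()
	{
	int m = 4;
	double xd[] = {1.0, 2.5, -3.0, 4.25};
	float xs[4];

	struct d_strvec svd;
	d_allocate_strvec(m, &svd);
	d_cvt_vec2strvec(m, xd, &svd, 0);        // assumed double-precision analogue

	struct s_strvec svs;
	void *mem = malloc(s_size_strvec(m));    // memory sized by the library
	s_create_strvec(m, &svs, mem);           // strvec mapped onto user-provided memory

	m_cvt_d2s_strvec(m, &svd, 0, &svs, 0);   // double strvec -> float strvec
	s_cvt_strvec2vec(m, &svs, 0, xs);        // back to a plain float array
	for(int i = 0; i < m; i++)
		printf("%f\n", xs[i]);

	d_free_strvec(&svd);
	free(mem);
	return 0;
	}
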
diff --git a/include/blasfeo_s_aux.h b/include/blasfeo_s_aux.h
new file mode 100644
index 0000000..d93509f
--- /dev/null
+++ b/include/blasfeo_s_aux.h
@@ -0,0 +1,137 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdio.h>
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+/************************************************
+* s_aux_lib.c
+************************************************/
+
+// returns the memory size (in bytes) needed for a strmat
+int s_size_strmat(int m, int n);
+// returns the memory size (in bytes) needed for the diagonal of a strmat
+int s_size_diag_strmat(int m, int n);
+// returns the memory size (in bytes) needed for a strvec
+int s_size_strvec(int m);
+// create a strmat for an m*n matrix, using the memory passed through the pointer (the pointer itself is not updated); see the usage sketch at the end of this file
+void s_create_strmat(int m, int n, struct s_strmat *sA, void *memory);
+// create a strvec for a vector of size m, using the memory passed through the pointer (the pointer itself is not updated)
+void s_create_strvec(int m, struct s_strvec *sA, void *memory);
+void s_cvt_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj);
+void s_cvt_vec2strvec(int m, float *a, struct s_strvec *sa, int ai);
+void s_cvt_tran_mat2strmat(int m, int n, float *A, int lda, struct s_strmat *sA, int ai, int aj);
+void s_cvt_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda);
+void s_cvt_strvec2vec(int m, struct s_strvec *sa, int ai, float *a);
+void s_cvt_tran_strmat2mat(int m, int n, struct s_strmat *sA, int ai, int aj, float *A, int lda);
+void s_cast_mat2strmat(float *A, struct s_strmat *sA);
+void s_cast_diag_mat2strmat(float *dA, struct s_strmat *sA);
+void s_cast_vec2vecmat(float *a, struct s_strvec *sa);
+void sgein1_libstr(float a, struct s_strmat *sA, int ai, int aj);
+float sgeex1_libstr(struct s_strmat *sA, int ai, int aj);
+void svecin1_libstr(float a, struct s_strvec *sx, int xi);
+float svecex1_libstr(struct s_strvec *sx, int xi);
+// A <= alpha
+void sgese_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj);
+// a <= alpha
+void svecse_libstr(int m, float alpha, struct s_strvec *sx, int xi);
+void sgecp_lib(int m, int n, float alpha, int offsetA, float *A, int sda, int offsetB, float *B, int sdb);
+void sgecp_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj);
+void sgesc_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj);
+void sveccp_libstr(int m, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci);
+void svecsc_libstr(int m, float alpha, struct s_strvec *sa, int ai);
+void strcp_l_lib(int m, float alpha, int offsetA, float *A, int sda, int offsetB, float *B, int sdb);
+void strcp_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj);
+void sgead_lib(int m, int n, float alpha, int offsetA, float *A, int sda, int offsetB, float *B, int sdb);
+void sgead_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj);
+void svecad_libstr(int m, float alpha, struct s_strvec *sa, int ai, struct s_strvec *sc, int ci);
+void sgetr_lib(int m, int n, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc);
+void sgetr_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj);
+void strtr_l_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc);
+void strtr_l_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj);
+void strtr_u_lib(int m, float alpha, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc);
+void strtr_u_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj);
+void sdiareg_lib(int kmax, float reg, int offset, float *pD, int sdd);
+void sdiaex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi);
+void sdiain_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj);
+void sdiain_sqrt_lib(int kmax, float *x, int offset, float *pD, int sdd);
+void sdiaex_lib(int kmax, float alpha, int offset, float *pD, int sdd, float *x);
+void sdiaad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd);
+void sdiain_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd);
+void sdiain_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj);
+void sdiaex_libsp(int kmax, int *idx, float alpha, float *pD, int sdd, float *x);
+void sdiaex_sp_libstr(int kmax, float alpha, int *idx, struct s_strmat *sD, int di, int dj, struct s_strvec *sx, int xi);
+void sdiaad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj);
+void sdiaad_libsp(int kmax, int *idx, float alpha, float *x, float *pD, int sdd);
+void sdiaad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj);
+void sdiaadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD, int sdd);
+void sdiaadin_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, int *idx, struct s_strmat *sD, int di, int dj);
+void srowin_lib(int kmax, float alpha, float *x, float *pD);
+void srowin_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj);
+void srowex_lib(int kmax, float alpha, float *pD, float *x);
+void srowex_libstr(int kmax, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi);
+void srowad_lib(int kmax, float alpha, float *x, float *pD);
+void srowad_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj);
+void srowin_libsp(int kmax, float alpha, int *idx, float *x, float *pD);
+void srowad_libsp(int kmax, int *idx, float alpha, float *x, float *pD);
+void srowad_sp_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strmat *sD, int di, int dj);
+void srowadin_libsp(int kmax, int *idx, float alpha, float *x, float *y, float *pD);
+void srowsw_lib(int kmax, float *pA, float *pC);
+void srowsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj);
+void srowpe_libstr(int kmax, int *ipiv, struct s_strmat *sA);
+void scolin_lib(int kmax, float *x, int offset, float *pD, int sdd);
+void scolin_libstr(int kmax, struct s_strvec *sx, int xi, struct s_strmat *sA, int ai, int aj);
+void scolad_lib(int kmax, float alpha, float *x, int offset, float *pD, int sdd);
+void scolin_libsp(int kmax, int *idx, float *x, float *pD, int sdd);
+void scolad_libsp(int kmax, float alpha, int *idx, float *x, float *pD, int sdd);
+void scolsw_lib(int kmax, int offsetA, float *pA, int sda, int offsetC, float *pC, int sdc);
+void scolsw_libstr(int kmax, struct s_strmat *sA, int ai, int aj, struct s_strmat *sC, int ci, int cj);
+void scolpe_libstr(int kmax, int *ipiv, struct s_strmat *sA);
+void svecin_libsp(int kmax, int *idx, float *x, float *y);
+void svecad_libsp(int kmax, int *idx, float alpha, float *x, float *y);
+void svecad_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi);
+void svecin_sp_libstr(int m, float alpha, struct s_strvec *sx, int xi, int *idx, struct s_strvec *sz, int zi);
+void svecex_sp_libstr(int m, float alpha, int *idx, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+void sveccl_libstr(int m, struct s_strvec *sxm, int xim, struct s_strvec *sx, int xi, struct s_strvec *sxp, int xip, struct s_strvec *sz, int zi);
+void sveccl_mask_libstr(int m, struct s_strvec *sxm, int xim, struct s_strvec *sx, int xi, struct s_strvec *sxp, int xip, struct s_strvec *sz, int zi, struct s_strvec *sm, int mi);
+void svecze_libstr(int m, struct s_strvec *sm, int mi, struct s_strvec *sv, int vi, struct s_strvec *se, int ei);
+void svecnrm_inf_libstr(int m, struct s_strvec *sx, int xi, float *ptr_norm);
+
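+// A minimal usage sketch (illustrative only), assuming the s_strmat definition from
+// blasfeo_common.h: it combines the size-query, create and convert routines declared above to
+// pack a column-major matrix into user-provided memory.
+//
+//     #include <stdlib.h>
+//
+//     float A[16], B[16];                       // 4x4 column-major matrices, lda = 4
+//     struct s_strmat sA;
+//     void *mem = malloc(s_size_strmat(4, 4));  // bytes needed for a 4x4 strmat
+//     s_create_strmat(4, 4, &sA, mem);          // map sA onto the buffer (sA does not own it)
+//     s_cvt_mat2strmat(4, 4, A, 4, &sA, 0, 0);  // pack A into sA at offset (0,0)
+//     s_cvt_strmat2mat(4, 4, &sA, 0, 0, B, 4);  // unpack sA back into column-major B
+//     free(mem);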
+
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/include/blasfeo_s_aux_ext_dep.h b/include/blasfeo_s_aux_ext_dep.h
new file mode 100644
index 0000000..2b9f9d4
--- /dev/null
+++ b/include/blasfeo_s_aux_ext_dep.h
@@ -0,0 +1,111 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(EXT_DEP)
+
+
+
+#include <stdio.h>
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+/************************************************
+* s_aux_extern_depend_lib.c
+************************************************/
+
+/* column-major matrices */
+
+// dynamically allocate row*col floats of memory, set *pA to point to it, and zero-initialize it
+void s_zeros(float **pA, int row, int col);
+// dynamically allocate row*col floats of memory aligned to a 64-byte boundary, set *pA to point to it, and zero-initialize it
+void s_zeros_align(float **pA, int row, int col);
+// dynamically allocate size bytes of memory aligned to a 64-byte boundary, set *pA to point to it, and zero-initialize it
+void s_zeros_align_bytes(float **pA, int size);
+// free the memory allocated by s_zeros
+void s_free(float *pA);
+// free the memory allocated by s_zeros_align or s_zeros_align_bytes
+void s_free_align(float *pA);
+// print a column-major matrix
+void s_print_mat(int m, int n, float *A, int lda);
+// print the transpose of a column-major matrix
+void s_print_tran_mat(int row, int col, float *A, int lda);
+// print to file a column-major matrix
+void s_print_to_file_mat(FILE *file, int row, int col, float *A, int lda);
+// print to file the transpose of a column-major matrix
+void s_print_tran_to_file_mat(FILE *file, int row, int col, float *A, int lda);
+// print in exponential notation a column-major matrix
+void s_print_e_mat(int m, int n, float *A, int lda);
+// print in exponential notation the transpose of a column-major matrix
+void s_print_e_tran_mat(int row, int col, float *A, int lda);
+
+/* strmat and strvec */
+
+#ifdef BLASFEO_COMMON
+// create a strmat for a matrix of size m*n by dynamically allocating memory
+void s_allocate_strmat(int m, int n, struct s_strmat *sA);
+// create a strvec for a vector of size m by dynamically allocating memory
+void s_allocate_strvec(int m, struct s_strvec *sa);
+// free the memory allocated by s_allocate_strmat
+void s_free_strmat(struct s_strmat *sA);
+// free the memory allocated by s_allocate_strvec
+void s_free_strvec(struct s_strvec *sa);
+// print a strmat
+void s_print_strmat(int m, int n, struct s_strmat *sA, int ai, int aj);
+// print in exponential notation a strmat
+void s_print_e_strmat(int m, int n, struct s_strmat *sA, int ai, int aj);
+// print to file a strmat
+void s_print_to_file_strmat(FILE *file, int m, int n, struct s_strmat *sA, int ai, int aj);
+// print a strvec
+void s_print_strvec(int m, struct s_strvec *sa, int ai);
+// print in exponential notation a strvec
+void s_print_e_strvec(int m, struct s_strvec *sa, int ai);
+// print to file a strvec
+void s_print_to_file_strvec(FILE *file, int m, struct s_strvec *sa, int ai);
+// print the transpose of a strvec
+void s_print_tran_strvec(int m, struct s_strvec *sa, int ai);
+// print in exponential notation the transpose of a strvec
+void s_print_e_tran_strvec(int m, struct s_strvec *sa, int ai);
+// print to file the transpose of a strvec
+void s_print_tran_to_file_strvec(FILE *file, int m, struct s_strvec *sa, int ai);
+#endif
+
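+// A minimal usage sketch (illustrative only, requires EXT_DEP and the s_strmat definition from
+// blasfeo_common.h); s_cvt_mat2strmat is declared in blasfeo_s_aux.h:
+//
+//     float *A;
+//     struct s_strmat sA;
+//     s_zeros(&A, 4, 4);                        // zero-initialized 4x4 column-major matrix
+//     s_allocate_strmat(4, 4, &sA);             // strmat with its own dynamically allocated memory
+//     s_cvt_mat2strmat(4, 4, A, 4, &sA, 0, 0);  // pack A into sA at offset (0,0)
+//     s_print_strmat(4, 4, &sA, 0, 0);          // print the 4x4 block starting at (0,0)
+//     s_free_strmat(&sA);
+//     s_free(A);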
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+
+#endif // EXT_DEP
diff --git a/include/blasfeo_s_blas.h b/include/blasfeo_s_blas.h
new file mode 100644
index 0000000..a0170a5
--- /dev/null
+++ b/include/blasfeo_s_blas.h
@@ -0,0 +1,160 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+//
+// level 1 BLAS
+//
+
+// z = y + alpha*x
+void saxpy_libstr(int kmax, float alpha, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi);
+// z = x .* y, return sum(z) = x^T * y
+float svecmuldot_libstr(int m, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi);
+// return x^T * y
+float sdot_libstr(int m, struct s_strvec *sx, int xi, struct s_strvec *sy, int yi);
+
+
+
+//
+// level 2 BLAS
+//
+
+// dense
+
+// z <= beta * y + alpha * A * x
+void sgemv_n_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi);
+// z <= beta * y + alpha * A' * x
+void sgemv_t_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi);
+// y <= inv( A ) * x, A (m)x(n)
+void strsv_lnn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(n)
+void strsv_ltn_mn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// y <= inv( A ) * x, A (m)x(m) lower, not_transposed, not_unit
+void strsv_lnn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// y <= inv( A ) * x, A (m)x(m) lower, not_transposed, unit
+void strsv_lnu_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(m) lower, transposed, not_unit
+void strsv_ltn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(m) lower, transposed, unit
+void strsv_ltu_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// y <= inv( A ) * x, A (m)x(m) upper, not_transposed, not_unit
+void strsv_unn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// y <= inv( A' ) * x, A (m)x(m) upper, transposed, not_unit
+void strsv_utn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// z <= A * x ; A upper triangular
+void strmv_unn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// z <= A' * x ; A upper triangular
+void strmv_utn_libstr(int m, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// z <= A * x ; A lower triangular
+void strmv_lnn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// z <= A' * x ; A lower triangular
+void strmv_ltn_libstr(int m, int n, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, struct s_strvec *sz, int zi);
+// z_n <= beta_n * y_n + alpha_n * A  * x_n
+// z_t <= beta_t * y_t + alpha_t * A' * x_t
+void sgemv_nt_libstr(int m, int n, float alpha_n, float alpha_t, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx_n, int xi_n, struct s_strvec *sx_t, int xi_t, float beta_n, float beta_t, struct s_strvec *sy_n, int yi_n, struct s_strvec *sy_t, int yi_t, struct s_strvec *sz_n, int zi_n, struct s_strvec *sz_t, int zi_t);
+// z <= beta * y + alpha * A * x, where A is symmetric and only the lower triangular part of A is accessed
+void ssymv_l_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi);
+
+// diagonal
+
+// z <= beta * y + alpha * A * x, A diagonal
+void sgemv_diag_libstr(int m, float alpha, struct s_strvec *sA, int ai, struct s_strvec *sx, int xi, float beta, struct s_strvec *sy, int yi, struct s_strvec *sz, int zi);
+
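+// A minimal usage sketch (illustrative only), assuming sA is a valid strmat and sx, sy, sz are
+// strvec objects of consistent sizes (e.g. created with the routines in blasfeo_s_aux.h):
+//
+//     // z <= 1.0 * y + 2.0 * A * x, reading the m x n block of A starting at offset (0,0)
+//     sgemv_n_libstr(m, n, 2.0, &sA, 0, 0, &sx, 0, 1.0, &sy, 0, &sz, 0);
+//
+//     // z <= inv( A ) * x, with A m x m lower triangular, not transposed, not unit diagonal
+//     strsv_lnn_libstr(m, &sA, 0, 0, &sx, 0, &sz, 0);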
+
+
+//
+// level 3 BLAS
+//
+
+// dense
+
+// D <= beta * C + alpha * A * B^T
+void sgemm_nt_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+// D <= beta * C + alpha * A * B
+void sgemm_nn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+// D <= beta * C + alpha * A * B^T ; C, D lower triangular
+void ssyrk_ln_libstr(int m, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+void ssyrk_ln_mn_libstr(int m, int n, int k, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+// D <= alpha * B * A^T ; A upper triangular
+void strmm_rutn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj);
+// D <= alpha * B * A ; A lower triangular
+void strmm_rlnn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj);
+// D <= alpha * B * A^{-T} , with A lower triangular employing explicit inverse of diagonal
+void strsm_rltn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj);
+// D <= alpha * B * A^{-T} , with A lower triangular with unit diagonal
+void strsm_rltu_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj);
+// D <= alpha * B * A^{-T} , with A upper triangular employing explicit inverse of diagonal
+void strsm_rutn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj);
+// D <= alpha * A^{-1} * B , with A lower triangular with unit diagonal
+void strsm_llnu_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj);
+// D <= alpha * A^{-1} * B , with A upper triangular employing explicit inverse of diagonal
+void strsm_lunn_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sD, int di, int dj);
+
+// diagonal
+
+// D <= alpha * A * B + beta * C, with A diagonal (stored as strvec)
+void sgemm_diag_left_ib(int m, int n, float alpha, float *dA, float *pB, int sdb, float beta, float *pC, int sdc, float *pD, int sdd);
+void sgemm_l_diag_libstr(int m, int n, float alpha, struct s_strvec *sA, int ai, struct s_strmat *sB, int bi, int bj, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+// D <= alpha * A * B + beta * C, with B diagonal (stored as strvec)
+void sgemm_r_diag_libstr(int m, int n, float alpha, struct s_strmat *sA, int ai, int aj, struct s_strvec *sB, int bi, float beta, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+
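+// A minimal usage sketch (illustrative only), assuming sA (m x k), sB (n x k), sC and sD (m x n)
+// are valid strmat objects:
+//
+//     // D <= 1.0 * C + 1.0 * A * B^T
+//     sgemm_nt_libstr(m, n, k, 1.0, &sA, 0, 0, &sB, 0, 0, 1.0, &sC, 0, 0, &sD, 0, 0);
+//
+//     // lower triangle of D <= C + A * A^T (m x m result; only the lower part is computed)
+//     ssyrk_ln_libstr(m, k, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sC, 0, 0, &sD, 0, 0);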
+
+
+//
+// LAPACK
+//
+
+// D <= chol( C ) ; C, D lower triangular
+void spotrf_l_libstr(int m, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+void spotrf_l_mn_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+// D <= chol( C + A * B' ) ; C, D lower triangular
+void ssyrk_spotrf_ln_libstr(int m, int n, int k, struct s_strmat *sA, int ai, int aj, struct s_strmat *sB, int bi, int bj, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+// D <= lu( C ) ; no pivoting
+void sgetrf_nopivot_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj);
+// D <= lu( C ) ; pivoting
+void sgetrf_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj, int *ipiv);
+// D <= qr( C )
+void sgeqrf_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj, void *work);
+int sgeqrf_work_size_libstr(int m, int n); // in bytes
+// D <= lq( C )
+void sgelqf_libstr(int m, int n, struct s_strmat *sC, int ci, int cj, struct s_strmat *sD, int di, int dj, void *work);
+int sgelqf_work_size_libstr(int m, int n); // in bytes
+
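+// A minimal usage sketch (illustrative only), assuming sC and sD are valid strmat objects of
+// consistent size (m x m with C symmetric positive definite for the Cholesky factorization,
+// m x n for the QR factorization); malloc/free come from <stdlib.h>:
+//
+//     // D <= chol( C ), lower triangular factor
+//     spotrf_l_libstr(m, &sC, 0, 0, &sD, 0, 0);
+//
+//     // D <= qr( C ), with the work buffer sized through the query routine
+//     void *work = malloc(sgeqrf_work_size_libstr(m, n));
+//     sgeqrf_libstr(m, n, &sC, 0, 0, &sD, 0, 0, work);
+//     free(work);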
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/include/blasfeo_s_kernel.h b/include/blasfeo_s_kernel.h
new file mode 100644
index 0000000..c0dc2b0
--- /dev/null
+++ b/include/blasfeo_s_kernel.h
@@ -0,0 +1,355 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// lib8
+//
+
+// 24x4
+void kernel_sgemm_nt_24x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_sgemm_nt_24x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+void kernel_sgemm_nt_24x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_sgemm_nn_24x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_sgemm_nn_24x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+void kernel_sgemm_nn_24x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_ssyrk_nt_l_24x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_ssyrk_nt_l_24x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+void kernel_ssyrk_nt_l_20x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_ssyrk_nt_l_20x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+void kernel_spotrf_nt_l_24x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+void kernel_spotrf_nt_l_24x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+void kernel_spotrf_nt_l_20x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+void kernel_spotrf_nt_l_20x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+void kernel_strsm_nt_rl_inv_24x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+void kernel_strsm_nt_rl_inv_24x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+void kernel_sgemm_strsm_nt_rl_inv_24x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+void kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_20x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+void kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_24x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+void kernel_strmm_nn_rl_24x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd);
+void kernel_strmm_nn_rl_24x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn);
+
+// 16x4
+void kernel_sgemm_nt_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_sgemm_nt_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+void kernel_sgemm_nt_16x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_sgemm_nn_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_sgemm_nn_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+void kernel_sgemm_nn_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_ssyrk_nt_l_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_ssyrk_nt_l_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+void kernel_ssyrk_nt_l_12x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_ssyrk_nt_l_12x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+void kernel_spotrf_nt_l_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+void kernel_spotrf_nt_l_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+void kernel_spotrf_nt_l_12x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+void kernel_spotrf_nt_l_12x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+void kernel_strsm_nt_rl_inv_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+void kernel_strsm_nt_rl_inv_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+void kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+void kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_12x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+void kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km_, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+void kernel_strmm_nn_rl_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd);
+void kernel_strmm_nn_rl_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn);
+void kernel_strmm_nn_rl_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+// 8x8
+void kernel_sgemm_nt_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+void kernel_sgemm_nt_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+void kernel_sgemm_nt_8x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_sgemm_nn_8x8_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
+void kernel_sgemm_nn_8x8_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+void kernel_sgemm_nn_8x8_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_ssyrk_nt_l_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+void kernel_ssyrk_nt_l_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+void kernel_spotrf_nt_l_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
+void kernel_spotrf_nt_l_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+void kernel_strsm_nt_rl_inv_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_strsm_nt_rl_inv_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+void kernel_sgemm_strsm_nt_rl_inv_8x8_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_8x8_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
+
+// 8x4
+void kernel_sgemm_nt_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+void kernel_sgemm_nt_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+void kernel_sgemm_nt_8x4_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_sgemm_nn_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
+void kernel_sgemm_nn_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+void kernel_sgemm_nn_8x4_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+//void kernel_ssyrk_nt_l_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+void kernel_ssyrk_nt_l_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+void kernel_spotrf_nt_l_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
+void kernel_spotrf_nt_l_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+void kernel_strsm_nt_rl_inv_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_strsm_nt_rl_inv_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+void kernel_sgemm_strsm_nt_rl_inv_8x4_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_8x4_lib8(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
+void kernel_strmm_nn_rl_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D);
+void kernel_strmm_nn_rl_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D, int km, int kn);
+void kernel_strmm_nn_rl_8x4_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+// 4x8
+void kernel_sgemm_nt_4x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+void kernel_sgemm_nt_4x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+void kernel_sgemm_nt_4x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_strsm_nt_rl_inv_4x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_strsm_nt_rl_inv_4x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+// 8
+void kernel_sgemv_n_8_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z);
+void kernel_sgemv_n_8_vs_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1);
+void kernel_sgemv_n_8_gen_lib8(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1);
+void kernel_sgemv_t_8_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);
+void kernel_sgemv_t_8_vs_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
+void kernel_sgemv_t_8_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *C, float *D, int km);
+void kernel_sgemv_t_4_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);
+void kernel_sgemv_t_4_vs_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
+void kernel_sgemv_t_4_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *C, float *D, int km);
+void kernel_strsv_ln_inv_8_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z);
+void kernel_strsv_ln_inv_8_vs_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+void kernel_strsv_lt_inv_8_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+void kernel_strsv_lt_inv_8_vs_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+void kernel_sgemv_nt_4_lib8(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t);
+void kernel_sgemv_nt_4_vs_lib8(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km);
+void kernel_ssymv_l_4l_lib8(int kmax, float *alpha, float *A, int sda, float *x, float *z);
+void kernel_ssymv_l_4r_lib8(int kmax, float *alpha, float *A, int sda, float *x, float *z);
+void kernel_ssymv_l_4l_gen_lib8(int kmax, float *alpha, int offA, float *A, int sda, float *x, float *z, int km);
+void kernel_ssymv_l_4r_gen_lib8(int kmax, float *alpha, int offA, float *A, int sda, float *x, float *z, int km);
+
+// aux
+void kernel_sgecp_8_0_lib8(int m, float *A, float *B);
+void kernel_sgecp_8_0_gen_lib8(int m, float *A, float *B, int m1);
+void kernel_sgecp_8_1_lib8(int m, float *A, int sda, float *B);
+void kernel_sgecp_8_1_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgecp_8_2_lib8(int m, float *A, int sda, float *B);
+void kernel_sgecp_8_2_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgecp_8_3_lib8(int m, float *A, int sda, float *B);
+void kernel_sgecp_8_3_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgecp_8_4_lib8(int m, float *A, int sda, float *B);
+void kernel_sgecp_8_4_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgecp_8_5_lib8(int m, float *A, int sda, float *B);
+void kernel_sgecp_8_5_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgecp_8_6_lib8(int m, float *A, int sda, float *B);
+void kernel_sgecp_8_6_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgecp_8_7_lib8(int m, float *A, int sda, float *B);
+void kernel_sgecp_8_7_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgesc_8_lib8(int m, float *alpha, float *A);
+void kernel_sgesc_8_gen_lib8(int m, float *alpha, float *A, int m1);
+void kernel_sgetr_8_0_lib8(int m, float *A, int sda, float *B);
+void kernel_sgetr_8_0_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgetr_8_1_lib8(int m, float *A, int sda, float *B);
+void kernel_sgetr_8_1_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgetr_8_2_lib8(int m, float *A, int sda, float *B);
+void kernel_sgetr_8_2_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgetr_8_3_lib8(int m, float *A, int sda, float *B);
+void kernel_sgetr_8_3_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgetr_8_4_lib8(int m, float *A, int sda, float *B);
+void kernel_sgetr_8_4_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgetr_8_5_lib8(int m, float *A, int sda, float *B);
+void kernel_sgetr_8_5_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgetr_8_6_lib8(int m, float *A, int sda, float *B);
+void kernel_sgetr_8_6_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgetr_8_7_lib8(int m, float *A, int sda, float *B);
+void kernel_sgetr_8_7_gen_lib8(int m, float *A, int sda, float *B, int m1);
+void kernel_sgead_8_0_lib8(int m, float *alpha, float *A, float *B);
+void kernel_sgead_8_0_gen_lib8(int m, float *alpha, float *A, float *B, int m1);
+void kernel_sgead_8_1_lib8(int m, float *alpha, float *A, int sda, float *B);
+void kernel_sgead_8_1_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1);
+void kernel_sgead_8_2_lib8(int m, float *alpha, float *A, int sda, float *B);
+void kernel_sgead_8_2_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1);
+void kernel_sgead_8_3_lib8(int m, float *alpha, float *A, int sda, float *B);
+void kernel_sgead_8_3_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1);
+void kernel_sgead_8_4_lib8(int m, float *alpha, float *A, int sda, float *B);
+void kernel_sgead_8_4_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1);
+void kernel_sgead_8_5_lib8(int m, float *alpha, float *A, int sda, float *B);
+void kernel_sgead_8_5_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1);
+void kernel_sgead_8_6_lib8(int m, float *alpha, float *A, int sda, float *B);
+void kernel_sgead_8_6_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1);
+void kernel_sgead_8_7_lib8(int m, float *alpha, float *A, int sda, float *B);
+void kernel_sgead_8_7_gen_lib8(int m, float *alpha, float *A, int sda, float *B, int m1);
+
+
+//
+// lib4
+//
+
+
+
+// level 2 BLAS
+// 4
+void kernel_sgemv_n_4_lib4(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z);
+void kernel_sgemv_n_4_vs_lib4(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1);
+void kernel_sgemv_n_4_gen_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1);
+void kernel_sgemv_t_4_lib4(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);
+void kernel_sgemv_t_4_vs_lib4(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
+void kernel_sgemv_t_4_gen_lib4(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *C, float *D, int km);
+void kernel_strsv_ln_inv_4_lib4(int k, float *A, float *inv_diag_A, float *x, float *y, float *z);
+void kernel_strsv_ln_inv_4_vs_lib4(int k, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+void kernel_strsv_lt_inv_4_lib4(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+void kernel_strsv_lt_inv_3_lib4(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+void kernel_strsv_lt_inv_2_lib4(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+void kernel_strsv_lt_inv_1_lib4(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+void kernel_strmv_un_4_lib4(int k, float *A, float *x, float *z);
+void kernel_strmv_ut_4_lib4(int k, float *A, int sda, float *x, float *z);
+void kernel_strmv_ut_4_vs_lib4(int k, float *A, int sda, float *x, float *z, int km);
+void kernel_sgemv_nt_6_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t);
+void kernel_sgemv_nt_4_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t);
+void kernel_sgemv_nt_4_vs_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km);
+void kernel_ssymv_l_4_lib4(int kmax, float *alpha, float *A, int sda, float *x_n, float *z_n);
+void kernel_ssymv_l_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x_n, float *z_n, int km);
+
+
+
+// level 3 BLAS
+// 12x4
+void kernel_sgemm_nt_12x4_lib4(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); //
+// 8x8
+void kernel_sgemm_nt_8x8_lib4(int k, float *alpha, float *A, int sda, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd); //
+// 8x4
+void kernel_sgemm_nt_8x4_lib4(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd); //
+// 4x4
+void kernel_sgemm_nt_4x4_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D); //
+void kernel_sgemm_nt_4x4_vs_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn); //
+void kernel_sgemm_nt_4x4_gen_lib4(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int k0, int k1);
+void kernel_sgemm_nn_4x4_lib4(int k, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D); //
+void kernel_sgemm_nn_4x4_vs_lib4(int k, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D, int km, int kn); //
+void kernel_ssyrk_nt_l_4x4_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D); //
+void kernel_ssyrk_nt_l_4x4_vs_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn); //
+void kernel_strmm_nt_ru_4x4_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D); //
+void kernel_strmm_nt_ru_4x4_vs_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn); //
+void kernel_strmm_nn_rl_4x4_lib4(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D);
+void kernel_strmm_nn_rl_4x4_gen_lib4(int k, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_strsm_nt_rl_inv_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_strsm_nt_rl_inv_4x4_vs_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+void kernel_strsm_nt_rl_one_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E);
+void kernel_strsm_nt_rl_one_4x4_vs_lib4(int k, float *A, float *B, float *C, float *D, float *E, int km, int kn);
+void kernel_strsm_nt_ru_inv_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_strsm_nt_ru_inv_4x4_vs_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+void kernel_strsm_nn_ru_inv_4x4_lib4(int k, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_strsm_nn_ru_inv_4x4_vs_lib4(int k, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+void kernel_strsm_nn_ll_one_4x4_lib4(int k, float *A, float *B, int sdb, float *C, float *D, float *E);
+void kernel_strsm_nn_ll_one_4x4_vs_lib4(int k, float *A, float *B, int sdb, float *C, float *D, float *E, int km, int kn);
+void kernel_strsm_nn_lu_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_strsm_nn_lu_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+// diag
+void kernel_sgemm_diag_right_4_a0_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *D, int sdd);
+void kernel_sgemm_diag_right_4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_sgemm_diag_right_3_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_sgemm_diag_right_2_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_sgemm_diag_right_1_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+void kernel_sgemm_diag_left_4_a0_lib4(int kmax, float *alpha, float *A, float *B, float *D);
+void kernel_sgemm_diag_left_4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+void kernel_sgemm_diag_left_3_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+void kernel_sgemm_diag_left_2_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+void kernel_sgemm_diag_left_1_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+
+
+// LAPACK
+// 4x4
+void kernel_spotrf_nt_l_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
+void kernel_spotrf_nt_l_4x4_vs_lib4(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+void kernel_sgetrf_nn_4x4_lib4(int k, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D);
+void kernel_sgetrf_nn_4x4_vs_lib4(int k, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D, int km, int kn);
+void kernel_sgetrf_pivot_4_lib4(int m, float *pA, int sda, float *inv_diag_A, int* ipiv);
+void kernel_sgetrf_pivot_4_vs_lib4(int m, int n, float *pA, int sda, float *inv_diag_A, int* ipiv);
+
+
+
+// merged routines
+// 4x4
+void kernel_sgemm_strsm_nt_rl_inv_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+void kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+void kernel_ssyrk_spotrf_nt_l_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
+
+
+
+// auxiliary routines
+void kernel_sgesc_4_lib4(int kmax, float *alpha, float *A);
+void kernel_sgesc_3_lib4(int kmax, float *alpha, float *A);
+void kernel_sgesc_2_lib4(int kmax, float *alpha, float *A);
+void kernel_sgesc_1_lib4(int kmax, float *alpha, float *A);
+void kernel_sgecp_4_0_lib4(int kmax, float *A, float *B);
+void kernel_sgecp_4_1_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_sgecp_4_2_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_sgecp_4_3_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_sgecp_3_0_lib4(int kmax, float *A, float *B);
+void kernel_sgecp_3_2_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_sgecp_3_3_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_sgecp_2_0_lib4(int kmax, float *A, float *B);
+void kernel_sgecp_2_3_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_sgecp_1_0_lib4(int kmax, float *A, float *B);
+void kernel_strcp_l_4_0_lib4(int kmax, float *A, float *B);
+void kernel_strcp_l_4_1_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_strcp_l_4_2_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_strcp_l_4_3_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_strcp_l_3_0_lib4(int kmax, float *A, float *B);
+void kernel_strcp_l_3_2_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_strcp_l_3_3_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_strcp_l_2_0_lib4(int kmax, float *A, float *B);
+void kernel_strcp_l_2_3_lib4(int kmax, float *A0, int sda, float *B);
+void kernel_strcp_l_1_0_lib4(int kmax, float *A, float *B);
+void kernel_sgead_4_0_lib4(int kmax, float *alpha, float *A, float *B);
+void kernel_sgead_4_1_lib4(int kmax, float *alpha, float *A0, int sda, float *B);
+void kernel_sgead_4_2_lib4(int kmax, float *alpha, float *A0, int sda, float *B);
+void kernel_sgead_4_3_lib4(int kmax, float *alpha, float *A0, int sda, float *B);
+void kernel_sgead_3_0_lib4(int kmax, float *alpha, float *A, float *B);
+void kernel_sgead_3_2_lib4(int kmax, float *alpha, float *A0, int sda, float *B);
+void kernel_sgead_3_3_lib4(int kmax, float *alpha, float *A0, int sda, float *B);
+void kernel_sgead_2_0_lib4(int kmax, float *alpha, float *A, float *B);
+void kernel_sgead_2_3_lib4(int kmax, float *alpha, float *A0, int sda, float *B);
+void kernel_sgead_1_0_lib4(int kmax, float *alpha, float *A, float *B);
+// TODO
+void kernel_sgeset_4_lib4(int kmax, float alpha, float *A);
+void kernel_strset_4_lib4(int kmax, float alpha, float *A);
+void kernel_sgetr_4_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc);
+void kernel_sgetr_3_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc);
+void kernel_sgetr_2_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc);
+void kernel_sgetr_1_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc);
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/include/blasfeo_v_aux_ext_dep.h b/include/blasfeo_v_aux_ext_dep.h
new file mode 100644
index 0000000..2555fab
--- /dev/null
+++ b/include/blasfeo_v_aux_ext_dep.h
@@ -0,0 +1,71 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(EXT_DEP)
+
+
+
+#include <stdio.h>
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+/************************************************
+* d_aux_extern_depend_lib.c
+************************************************/
+
+// dynamically allocate size bytes of memory and set the void pointer accordingly; the allocated memory is zero-initialized
+void v_zeros(void **ptrA, int size);
+// dynamically allocate size bytes of memory aligned to a 64-byte boundary and set the void pointer accordingly; the allocated memory is zero-initialized
+void v_zeros_align(void **ptrA, int size);
+// free the memory allocated by v_zeros
+void v_free(void *ptrA);
+// free the memory allocated by v_zeros_align
+void v_free_align(void *ptrA);
+// dynamically allocate size bytes of memory and set the char pointer accordingly; the allocated memory is zero-initialized
+void c_zeros(char **ptrA, int size);
+// dynamically allocate size bytes of memory aligned to a 64-byte boundary and set the char pointer accordingly; the allocated memory is zero-initialized
+void c_zeros_align(char **ptrA, int size);
+// free the memory allocated by c_zeros
+void c_free(char *ptrA);
+// free the memory allocated by c_zeros_align
+void c_free_align(char *ptrA);
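+
+// minimal usage sketch (illustration only; the buffer name and size below are
+// arbitrary): allocate a zero-initialized, 64-byte aligned buffer and release
+// it with the matching free routine.
+//
+//	void *work;
+//	v_zeros_align(&work, 4096); // 4096 bytes, zeroed, 64-byte aligned
+//	// ... use work ...
+//	v_free_align(work); // pair with v_zeros_align, not with v_free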
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+
+#endif // EXT_DEP
diff --git a/kernel/Makefile b/kernel/Makefile
new file mode 100644
index 0000000..60e1f31
--- /dev/null
+++ b/kernel/Makefile
@@ -0,0 +1,75 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../Makefile.rule
+
+obj:
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+	( cd avx2; $(MAKE) obj)
+	( cd avx; $(MAKE) obj)
+	( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+	( cd avx; $(MAKE) obj)
+	( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+	( cd sse3; $(MAKE) obj)
+	( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+	( cd fma; $(MAKE) obj)
+	( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+	( cd armv8a; $(MAKE) obj)
+	( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+	( cd armv7a; $(MAKE) obj)
+	( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), GENERIC)
+	( cd c99; $(MAKE) obj)
+endif
+
+clean:
+	make -C avx2 clean
+	make -C avx clean
+	make -C sse3 clean
+	make -C fma clean
+	make -C armv8a clean
+	make -C armv7a clean
+	make -C c99 clean
+
diff --git a/kernel/armv7a/Makefile b/kernel/armv7a/Makefile
new file mode 100644
index 0000000..4cb59a7
--- /dev/null
+++ b/kernel/armv7a/Makefile
@@ -0,0 +1,49 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS = 
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += kernel_dgemm_4x4_lib4.o
+OBJS += kernel_sgemm_12x4_lib4.o kernel_sgemm_8x4_lib4.o kernel_sgemm_4x4_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+	rm -f *.o
+	rm -f *.s
+
diff --git a/kernel/armv7a/kernel_dgemm_4x4_lib4.S b/kernel/armv7a/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..86aee4f
--- /dev/null
+++ b/kernel/armv7a/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,3223 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- k
+// r5   <- A
+// r6   <- B
+//
+// output arguments:
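+//
+// note: informal sketch, assuming the standard lib4 panel-major layout
+// (4-row panels): d0-d15 hold a 4x4 accumulator column-major (d0-d3 is
+// column 0, d4-d7 column 1, ...); each iteration of k consumes one 4-element
+// column of the A panel and one of the B panel and accumulates acc += A*B^T;
+// the main loop is unrolled 4 times with software prefetch, and a scalar
+// clean-up loop handles the remaining iterations.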
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nt_4x4_lib4, %function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+	// early return
+	cmp		r4, #0
+	ble		2f // return
+
+	// prefetch
+	pld		[r5, #0]
+	pld		[r6, #0]
+
+	// preload A even
+	fldd	d16, [r5, #0]
+	fldd	d17, [r5, #8]
+	fldd	d18, [r5, #16]
+	fldd	d19, [r5, #24]
+
+	// preload B even
+	fldd	d20, [r6, #0]
+	fldd	d21, [r6, #8]
+	fldd	d22, [r6, #16]
+	fldd	d23, [r6, #24]
+
+	// preload A odd
+	fldd	d24, [r5, #32]
+	fldd	d25, [r5, #40]
+	fldd	d26, [r5, #48]
+	fldd	d27, [r5, #56]
+
+	// preload B odd
+	fldd	d28, [r6, #32]
+	fldd	d29, [r6, #40]
+	fldd	d30, [r6, #48]
+	fldd	d31, [r6, #56]
+
+	// prefetch
+	pld		[r5, #64]
+	pld		[r6, #64]
+
+	cmp		r4, #4
+	ble		0f // consider clean up loop
+
+	// main loop
+1:
+	
+	// unroll 0
+	fmacd	d0, d16, d20
+	pld		[r5, #128] // prefetch
+	fmacd	d1, d17, d20
+	pld		[r6, #128] // prefetch
+	fmacd	d2, d18, d20
+	fmacd	d3, d19, d20
+	fldd	d20, [r6, #64] // B
+
+	fmacd	d4, d16, d21
+	fmacd	d5, d17, d21
+	fmacd	d6, d18, d21
+	fmacd	d7, d19, d21
+	fldd	d21, [r6, #72] // B
+
+	fmacd	d8, d16, d22
+	fmacd	d9, d17, d22
+	fmacd	d10, d18, d22
+	fmacd	d11, d19, d22
+	fldd	d22, [r6, #80] // B
+
+	fmacd	d12, d16, d23
+	fldd	d16, [r5, #64] // A
+	fmacd	d13, d17, d23
+	fldd	d17, [r5, #72] // A
+	fmacd	d14, d18, d23
+	fldd	d18, [r5, #80] // A
+	fmacd	d15, d19, d23
+	fldd	d19, [r5, #88] // A
+	fldd	d23, [r6, #88] // B
+
+	// unroll 1
+	fmacd	d0, d24, d28
+	fmacd	d1, d25, d28
+	fmacd	d2, d26, d28
+	fmacd	d3, d27, d28
+	fldd	d28, [r6, #96] // B
+
+	fmacd	d4, d24, d29
+	fmacd	d5, d25, d29
+	fmacd	d6, d26, d29
+	fmacd	d7, d27, d29
+	fldd	d29, [r6, #104] // B
+
+	fmacd	d8, d24, d30
+	fmacd	d9, d25, d30
+	fmacd	d10, d26, d30
+	fmacd	d11, d27, d30
+	fldd	d30, [r6, #112] // B
+
+	fmacd	d12, d24, d31
+	fldd	d24, [r5, #96] // A
+	fmacd	d13, d25, d31
+	fldd	d25, [r5, #104] // A
+	fmacd	d14, d26, d31
+	fldd	d26, [r5, #112] // A
+	fmacd	d15, d27, d31
+	fldd	d27, [r5, #120] // A
+	fldd	d31, [r6, #120] // B
+
+
+
+	// unroll 2
+	fmacd	d0, d16, d20
+	pld		[r6, #192] // prefetch
+	fmacd	d1, d17, d20
+	add		r6, r6, #128
+	fmacd	d2, d18, d20
+	fmacd	d3, d19, d20
+	fldd	d20, [r6, #0] // B
+
+	fmacd	d4, d16, d21
+	pld		[r5, #192] // prefetch
+	fmacd	d5, d17, d21
+	add		r5, r5, #128
+	fmacd	d6, d18, d21
+	fmacd	d7, d19, d21
+	fldd	d21, [r6, #8] // B
+
+	fmacd	d8, d16, d22
+	fmacd	d9, d17, d22
+	fmacd	d10, d18, d22
+	fmacd	d11, d19, d22
+	fldd	d22, [r6, #16] // B
+
+	fmacd	d12, d16, d23
+	fldd	d16, [r5, #0] // A
+	fmacd	d13, d17, d23
+	fldd	d17, [r5, #8] // A
+	fmacd	d14, d18, d23
+	fldd	d18, [r5, #16] // A
+	fmacd	d15, d19, d23
+	fldd	d19, [r5, #24] // A
+	fldd	d23, [r6, #24] // B
+
+	// unroll 3
+	fmacd	d0, d24, d28
+	fmacd	d1, d25, d28
+	fmacd	d2, d26, d28
+	fmacd	d3, d27, d28
+	fldd	d28, [r6, #32] // B
+
+	fmacd	d4, d24, d29
+	fmacd	d5, d25, d29
+	fmacd	d6, d26, d29
+	fmacd	d7, d27, d29
+	fldd	d29, [r6, #40] // B
+
+	fmacd	d8, d24, d30
+	sub		r4, r4, #4
+	fmacd	d9, d25, d30
+	fmacd	d10, d26, d30
+	fmacd	d11, d27, d30
+	fldd	d30, [r6, #48] // B
+
+	fmacd	d12, d24, d31
+	fldd	d24, [r5, #32] // A
+	fmacd	d13, d25, d31
+	fldd	d25, [r5, #40] // A
+	fmacd	d14, d26, d31
+	fldd	d26, [r5, #48] // A
+	fmacd	d15, d27, d31
+	fldd	d27, [r5, #56] // A
+	fldd	d31, [r6, #56] // B
+
+	cmp		r4, #4
+	bgt		1b
+
+0:
+
+	cmp		r4, #3
+	ble		4f
+
+	// unroll 0
+	fmacd	d0, d16, d20
+	fmacd	d1, d17, d20
+	fmacd	d2, d18, d20
+	fmacd	d3, d19, d20
+	fldd	d20, [r6, #64] // B
+
+	fmacd	d4, d16, d21
+	fmacd	d5, d17, d21
+	fmacd	d6, d18, d21
+	fmacd	d7, d19, d21
+	fldd	d21, [r6, #72] // B
+
+	fmacd	d8, d16, d22
+	fmacd	d9, d17, d22
+	fmacd	d10, d18, d22
+	fmacd	d11, d19, d22
+	fldd	d22, [r6, #80] // B
+
+	fmacd	d12, d16, d23
+	fldd	d16, [r5, #64] // A
+	fmacd	d13, d17, d23
+	fldd	d17, [r5, #72] // A
+	fmacd	d14, d18, d23
+	fldd	d18, [r5, #80] // A
+	fmacd	d15, d19, d23
+	fldd	d19, [r5, #88] // A
+	fldd	d23, [r6, #88] // B
+
+	// unroll 1
+	fmacd	d0, d24, d28
+	fmacd	d1, d25, d28
+	fmacd	d2, d26, d28
+	fmacd	d3, d27, d28
+	fldd	d28, [r6, #96] // B
+
+	fmacd	d4, d24, d29
+	fmacd	d5, d25, d29
+	fmacd	d6, d26, d29
+	fmacd	d7, d27, d29
+	fldd	d29, [r6, #104] // B
+
+	fmacd	d8, d24, d30
+	fmacd	d9, d25, d30
+	fmacd	d10, d26, d30
+	fmacd	d11, d27, d30
+	fldd	d30, [r6, #112] // B
+
+	fmacd	d12, d24, d31
+	fldd	d24, [r5, #96] // A
+	fmacd	d13, d25, d31
+	fldd	d25, [r5, #104] // A
+	fmacd	d14, d26, d31
+	fldd	d26, [r5, #112] // A
+	fmacd	d15, d27, d31
+	fldd	d27, [r5, #120] // A
+	fldd	d31, [r6, #120] // B
+
+	add		r5, r5, #128
+	add		r6, r6, #128
+
+	// unroll 2
+	fmacd	d0, d16, d20
+	fmacd	d1, d17, d20
+	fmacd	d2, d18, d20
+	fmacd	d3, d19, d20
+
+	fmacd	d4, d16, d21
+	fmacd	d5, d17, d21
+	fmacd	d6, d18, d21
+	fmacd	d7, d19, d21
+
+	fmacd	d8, d16, d22
+	fmacd	d9, d17, d22
+	fmacd	d10, d18, d22
+	fmacd	d11, d19, d22
+
+	fmacd	d12, d16, d23
+	fmacd	d13, d17, d23
+	fmacd	d14, d18, d23
+	fmacd	d15, d19, d23
+
+	// unroll 3
+	fmacd	d0, d24, d28
+	fmacd	d1, d25, d28
+	fmacd	d2, d26, d28
+	fmacd	d3, d27, d28
+
+	fmacd	d4, d24, d29
+	fmacd	d5, d25, d29
+	fmacd	d6, d26, d29
+	fmacd	d7, d27, d29
+
+	fmacd	d8, d24, d30
+	fmacd	d9, d25, d30
+	fmacd	d10, d26, d30
+	fmacd	d11, d27, d30
+
+	fmacd	d12, d24, d31
+	fmacd	d13, d25, d31
+	fmacd	d14, d26, d31
+	fmacd	d15, d27, d31
+
+	sub		r4, r4, #4
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		r4, #0
+	ble		2f // return
+
+3: // clean1-up loop
+
+	fldd	d16, [r5, #0] // A
+	fldd	d17, [r5, #8] // A
+	fldd	d18, [r5, #16] // A
+	fldd	d19, [r5, #24] // A
+
+	fldd	d20, [r6, #0] // B
+	fmacd	d0, d16, d20
+	fmacd	d1, d17, d20
+	fmacd	d2, d18, d20
+	fmacd	d3, d19, d20
+
+	fldd	d21, [r6, #8] // B
+	fmacd	d4, d16, d21
+	fmacd	d5, d17, d21
+	fmacd	d6, d18, d21
+	fmacd	d7, d19, d21
+
+	fldd	d22, [r6, #16] // B
+	fmacd	d8, d16, d22
+	fmacd	d9, d17, d22
+	fmacd	d10, d18, d22
+	fmacd	d11, d19, d22
+
+	fldd	d23, [r6, #24] // B
+	fmacd	d12, d16, d23
+	fmacd	d13, d17, d23
+	fmacd	d14, d18, d23
+	fmacd	d15, d19, d23
+
+	add		r5, r5, #32
+	add		r6, r6, #32
+
+	sub		r4, r4, #1
+	cmp		r4, #0
+	bgt		3b
+
+2: // return
+
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- k
+// r5   <- A
+// r6   <- B
+// r7   <- 4*sdb*sizeof(double)
+//
+// output arguments:
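+//
+// note: same 4x4 accumulator scheme as the nt kernel above, but B is not
+// transposed: per iteration of k the four B operands are read 32 bytes
+// (one panel column) apart, i.e. one row of the current B panel, and every
+// 4 iterations the B pointer advances by r7 = 4*sdb*sizeof(double) to the
+// next panel.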
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nn_4x4_lib4, %function
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+	// early return
+	cmp		r4, #0
+	ble		2f // return
+
+	// prefetch
+	pld		[r5, #0]
+	pld		[r6, #0]
+	pld		[r6, #64]
+
+	// preload A even
+	fldd	d16, [r5, #0]
+	fldd	d17, [r5, #8]
+	fldd	d18, [r5, #16]
+	fldd	d19, [r5, #24]
+
+	// preload B even
+	fldd	d20, [r6, #0]
+	fldd	d21, [r6, #32]
+	fldd	d22, [r6, #64]
+	fldd	d23, [r6, #96]
+
+	// preload A odd
+	fldd	d24, [r5, #32]
+	fldd	d25, [r5, #40]
+	fldd	d26, [r5, #48]
+	fldd	d27, [r5, #56]
+
+	// preload B odd
+	fldd	d28, [r6, #8]
+	fldd	d29, [r6, #40]
+	fldd	d30, [r6, #72]
+	fldd	d31, [r6, #104]
+
+	// prefetch
+	pld		[r5, #64]
+
+	// B next
+	add		r9, r7, r6
+
+	cmp		r4, #4
+	ble		0f // consider clean up loop
+
+	// main loop
+1:
+	
+	// unroll 0
+	fmacd	d0, d16, d20
+	pld		[r5, #128] // prefetch
+	fmacd	d1, d17, d20
+	pld		[r9, #0]
+	fmacd	d2, d18, d20
+	pld		[r9, #64]
+	fmacd	d3, d19, d20
+	fldd	d20, [r6, #16] // B
+
+	fmacd	d4, d16, d21
+	fmacd	d5, d17, d21
+	fmacd	d6, d18, d21
+	fmacd	d7, d19, d21
+	fldd	d21, [r6, #48] // B
+
+	fmacd	d8, d16, d22
+	fmacd	d9, d17, d22
+	fmacd	d10, d18, d22
+	fmacd	d11, d19, d22
+	fldd	d22, [r6, #80] // B
+
+	fmacd	d12, d16, d23
+	fldd	d16, [r5, #64] // A
+	fmacd	d13, d17, d23
+	fldd	d17, [r5, #72] // A
+	fmacd	d14, d18, d23
+	fldd	d18, [r5, #80] // A
+	fmacd	d15, d19, d23
+	fldd	d19, [r5, #88] // A
+	fldd	d23, [r6, #112] // B
+
+	// unroll 1
+	fmacd	d0, d24, d28
+	fmacd	d1, d25, d28
+	fmacd	d2, d26, d28
+	fmacd	d3, d27, d28
+	fldd	d28, [r6, #24] // B
+
+	fmacd	d4, d24, d29
+	fmacd	d5, d25, d29
+	fmacd	d6, d26, d29
+	fmacd	d7, d27, d29
+	fldd	d29, [r6, #56] // B
+
+	fmacd	d8, d24, d30
+	fmacd	d9, d25, d30
+	fmacd	d10, d26, d30
+	fmacd	d11, d27, d30
+	fldd	d30, [r6, #88] // B
+
+	fmacd	d12, d24, d31
+	fldd	d24, [r5, #96] // A
+	fmacd	d13, d25, d31
+	fldd	d25, [r5, #104] // A
+	fmacd	d14, d26, d31
+	fldd	d26, [r5, #112] // A
+	fmacd	d15, d27, d31
+	fldd	d27, [r5, #120] // A
+	fldd	d31, [r6, #120] // B
+
+	// unroll 2
+	fmacd	d0, d16, d20
+	pld		[r5, #192] // prefetch
+	fmacd	d1, d17, d20
+	mov		r6, r9
+	fmacd	d2, d18, d20
+	fmacd	d3, d19, d20
+	fldd	d20, [r6, #0] // B
+
+	fmacd	d4, d16, d21
+	add		r5, r5, #128
+	fmacd	d5, d17, d21
+	fmacd	d6, d18, d21
+	fmacd	d7, d19, d21
+	fldd	d21, [r6, #32] // B
+
+	fmacd	d8, d16, d22
+	add		r9, r9, r7
+	fmacd	d9, d17, d22
+	fmacd	d10, d18, d22
+	fmacd	d11, d19, d22
+	fldd	d22, [r6, #64] // B
+
+	fmacd	d12, d16, d23
+	fldd	d16, [r5, #0] // A
+	fmacd	d13, d17, d23
+	fldd	d17, [r5, #8] // A
+	fmacd	d14, d18, d23
+	fldd	d18, [r5, #16] // A
+	fmacd	d15, d19, d23
+	fldd	d19, [r5, #24] // A
+	fldd	d23, [r6, #96] // B
+
+	// unroll 3
+	fmacd	d0, d24, d28
+	fmacd	d1, d25, d28
+	fmacd	d2, d26, d28
+	fmacd	d3, d27, d28
+	fldd	d28, [r6, #8] // B
+
+	fmacd	d4, d24, d29
+	sub		r4, r4, #4
+	fmacd	d5, d25, d29
+	fmacd	d6, d26, d29
+	fmacd	d7, d27, d29
+	fldd	d29, [r6, #40] // B
+
+	fmacd	d8, d24, d30
+	fmacd	d9, d25, d30
+	fmacd	d10, d26, d30
+	fmacd	d11, d27, d30
+	fldd	d30, [r6, #72] // B
+
+	fmacd	d12, d24, d31
+	fldd	d24, [r5, #32] // A
+	fmacd	d13, d25, d31
+	fldd	d25, [r5, #40] // A
+	fmacd	d14, d26, d31
+	fldd	d26, [r5, #48] // A
+	fmacd	d15, d27, d31
+	fldd	d27, [r5, #56] // A
+	fldd	d31, [r6, #104] // B
+
+	cmp		r4, #4
+	bgt		1b
+
+0:
+
+	cmp		r4, #3
+	ble		4f
+
+	// unroll 0
+	fmacd	d0, d16, d20
+	fmacd	d1, d17, d20
+	fmacd	d2, d18, d20
+	fmacd	d3, d19, d20
+	fldd	d20, [r6, #16] // B
+
+	fmacd	d4, d16, d21
+	fmacd	d5, d17, d21
+	fmacd	d6, d18, d21
+	fmacd	d7, d19, d21
+	fldd	d21, [r6, #48] // B
+
+	fmacd	d8, d16, d22
+	fmacd	d9, d17, d22
+	fmacd	d10, d18, d22
+	fmacd	d11, d19, d22
+	fldd	d22, [r6, #80] // B
+
+	fmacd	d12, d16, d23
+	fldd	d16, [r5, #64] // A
+	fmacd	d13, d17, d23
+	fldd	d17, [r5, #72] // A
+	fmacd	d14, d18, d23
+	fldd	d18, [r5, #80] // A
+	fmacd	d15, d19, d23
+	fldd	d19, [r5, #88] // A
+	fldd	d23, [r6, #112] // B
+
+	// unroll 1
+	fmacd	d0, d24, d28
+	fmacd	d1, d25, d28
+	fmacd	d2, d26, d28
+	fmacd	d3, d27, d28
+	fldd	d28, [r6, #24] // B
+
+	fmacd	d4, d24, d29
+	fmacd	d5, d25, d29
+	fmacd	d6, d26, d29
+	fmacd	d7, d27, d29
+	fldd	d29, [r6, #56] // B
+
+	fmacd	d8, d24, d30
+	fmacd	d9, d25, d30
+	fmacd	d10, d26, d30
+	fmacd	d11, d27, d30
+	fldd	d30, [r6, #88] // B
+
+	fmacd	d12, d24, d31
+	fldd	d24, [r5, #96] // A
+	fmacd	d13, d25, d31
+	fldd	d25, [r5, #104] // A
+	fmacd	d14, d26, d31
+	fldd	d26, [r5, #112] // A
+	fmacd	d15, d27, d31
+	fldd	d27, [r5, #120] // A
+	fldd	d31, [r6, #120] // B
+
+	add		r5, r5, #128
+	mov		r6, r9
+
+	// unroll 2
+	fmacd	d0, d16, d20
+	fmacd	d1, d17, d20
+	fmacd	d2, d18, d20
+	fmacd	d3, d19, d20
+
+	fmacd	d4, d16, d21
+	fmacd	d5, d17, d21
+	fmacd	d6, d18, d21
+	fmacd	d7, d19, d21
+
+	fmacd	d8, d16, d22
+	fmacd	d9, d17, d22
+	fmacd	d10, d18, d22
+	fmacd	d11, d19, d22
+
+	fmacd	d12, d16, d23
+	fmacd	d13, d17, d23
+	fmacd	d14, d18, d23
+	fmacd	d15, d19, d23
+
+	// unroll 3
+	fmacd	d0, d24, d28
+	fmacd	d1, d25, d28
+	fmacd	d2, d26, d28
+	fmacd	d3, d27, d28
+
+	fmacd	d4, d24, d29
+	fmacd	d5, d25, d29
+	fmacd	d6, d26, d29
+	fmacd	d7, d27, d29
+
+	fmacd	d8, d24, d30
+	fmacd	d9, d25, d30
+	fmacd	d10, d26, d30
+	fmacd	d11, d27, d30
+
+	fmacd	d12, d24, d31
+	fmacd	d13, d25, d31
+	fmacd	d14, d26, d31
+	fmacd	d15, d27, d31
+
+	sub		r4, r4, #4
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		r4, #0
+	ble		2f // return
+
+3: // clean1-up loop
+
+	fldd	d16, [r5, #0] // A
+	fldd	d17, [r5, #8] // A
+	fldd	d18, [r5, #16] // A
+	fldd	d19, [r5, #24] // A
+
+	fldd	d20, [r6, #0] // B
+	fmacd	d0, d16, d20
+	fmacd	d1, d17, d20
+	fmacd	d2, d18, d20
+	fmacd	d3, d19, d20
+
+	fldd	d21, [r6, #32] // B
+	fmacd	d4, d16, d21
+	fmacd	d5, d17, d21
+	fmacd	d6, d18, d21
+	fmacd	d7, d19, d21
+
+	fldd	d22, [r6, #64] // B
+	fmacd	d8, d16, d22
+	fmacd	d9, d17, d22
+	fmacd	d10, d18, d22
+	fmacd	d11, d19, d22
+
+	fldd	d23, [r6, #96] // B
+	fmacd	d12, d16, d23
+	fmacd	d13, d17, d23
+	fmacd	d14, d18, d23
+	fmacd	d15, d19, d23
+
+	add		r5, r5, #32
+	add		r6, r6, #8
+
+	sub		r4, r4, #1
+	cmp		r4, #0
+	bgt		3b
+
+2: // return
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nn_4x4_lib4, .-inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- k
+// r5   <- A
+// r6   <- B
+//
+// output arguments:
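+//
+// note: same flow as the dgemm nt kernel above, but only the lower-triangular
+// accumulators (d0-d3, d5-d7, d10-d11, d15) are updated, since only the lower
+// part of the symmetric syrk result block is needed.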
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DSYRK_L_ADD_NT_4X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dsyrk_l_add_nt_4x4_lib4, %function
+inner_kernel_dsyrk_l_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dsyrk_l_add_nt_4x4_lib4:
+#endif
+#endif
+
+	// early return
+	cmp		r4, #0
+	ble		2f // return
+
+	// prefetch
+	pld		[r5, #0]
+	pld		[r6, #0]
+
+	// preload A even
+	fldd	d16, [r5, #0]
+	fldd	d17, [r5, #8]
+	fldd	d18, [r5, #16]
+	fldd	d19, [r5, #24]
+
+	// preload B even
+	fldd	d20, [r6, #0]
+	fldd	d21, [r6, #8]
+	fldd	d22, [r6, #16]
+	fldd	d23, [r6, #24]
+
+	// preload A odd
+	fldd	d24, [r5, #32]
+	fldd	d25, [r5, #40]
+	fldd	d26, [r5, #48]
+	fldd	d27, [r5, #56]
+
+	// preload B odd
+	fldd	d28, [r6, #32]
+	fldd	d29, [r6, #40]
+	fldd	d30, [r6, #48]
+	fldd	d31, [r6, #56]
+
+	// prefetch
+	pld		[r5, #64]
+	pld		[r6, #64]
+
+	cmp		r4, #4
+	ble		0f // consider clean up loop
+
+	// main loop
+1:
+	
+	// prefetch
+	pld		[r5, #128]
+	pld		[r6, #128]
+
+	// unroll 0
+	fmacd	d0, d16, d20
+	fldd	d16, [r5, #64] // A
+	fmacd	d1, d17, d20
+	fmacd	d2, d18, d20
+	fmacd	d3, d19, d20
+	fldd	d20, [r6, #64] // B
+
+	fmacd	d5, d17, d21
+	fldd	d17, [r5, #72] // A
+	fmacd	d6, d18, d21
+	fmacd	d7, d19, d21
+	fldd	d21, [r6, #72] // B
+
+	fmacd	d10, d18, d22
+	fldd	d18, [r5, #80] // A
+	fmacd	d11, d19, d22
+	fldd	d22, [r6, #80] // B
+
+	fmacd	d15, d19, d23
+	fldd	d19, [r5, #88] // A
+	fldd	d23, [r6, #88] // B
+
+	// unroll 1
+	fmacd	d0, d24, d28
+	fldd	d24, [r5, #96] // A
+	fmacd	d1, d25, d28
+	fmacd	d2, d26, d28
+	fmacd	d3, d27, d28
+	fldd	d28, [r6, #96] // B
+
+	fmacd	d5, d25, d29
+	fldd	d25, [r5, #104] // A
+	fmacd	d6, d26, d29
+	fmacd	d7, d27, d29
+	fldd	d29, [r6, #104] // B
+
+	fmacd	d10, d26, d30
+	fldd	d26, [r5, #112] // A
+	fmacd	d11, d27, d30
+	fldd	d30, [r6, #112] // B
+
+	fmacd	d15, d27, d31
+	fldd	d27, [r5, #120] // A
+	fldd	d31, [r6, #120] // B
+
+	// prefetch
+	pld		[r5, #192]
+	pld		[r6, #192]
+
+	add		r5, r5, #128
+	add		r6, r6, #128
+
+	// unroll 2
+	fmacd	d0, d16, d20
+	fldd	d16, [r5, #0] // A
+	fmacd	d1, d17, d20
+	fmacd	d2, d18, d20
+	fmacd	d3, d19, d20
+	fldd	d20, [r6, #0] // B
+
+	fmacd	d5, d17, d21
+	fldd	d17, [r5, #8] // A
+	fmacd	d6, d18, d21
+	fmacd	d7, d19, d21
+	fldd	d21, [r6, #8] // B
+
+	fmacd	d10, d18, d22
+	fldd	d18, [r5, #16] // A
+	fmacd	d11, d19, d22
+	fldd	d22, [r6, #16] // B
+
+	fmacd	d15, d19, d23
+	fldd	d19, [r5, #24] // A
+	fldd	d23, [r6, #24] // B
+
+	// unroll 3
+	fmacd	d0, d24, d28
+	fldd	d24, [r5, #32] // A
+	fmacd	d1, d25, d28
+	fmacd	d2, d26, d28
+	fmacd	d3, d27, d28
+	fldd	d28, [r6, #32] // B
+
+	fmacd	d5, d25, d29
+	fldd	d25, [r5, #40] // A
+	fmacd	d6, d26, d29
+	fmacd	d7, d27, d29
+	fldd	d29, [r6, #40] // B
+
+	fmacd	d10, d26, d30
+	fldd	d26, [r5, #48] // A
+	fmacd	d11, d27, d30
+	fldd	d30, [r6, #48] // B
+
+	fmacd	d15, d27, d31
+	fldd	d27, [r5, #56] // A
+	fldd	d31, [r6, #56] // B
+
+	sub		r4, r4, #4
+	cmp		r4, #4
+	bgt		1b
+
+0:
+
+	cmp		r4, #3
+	ble		4f
+
+	// unroll 0
+	fmacd	d0, d16, d20
+	fldd	d16, [r5, #64] // A
+	fmacd	d1, d17, d20
+	fmacd	d2, d18, d20
+	fmacd	d3, d19, d20
+	fldd	d20, [r6, #64] // B
+
+	fmacd	d5, d17, d21
+	fldd	d17, [r5, #72] // A
+	fmacd	d6, d18, d21
+	fmacd	d7, d19, d21
+	fldd	d21, [r6, #72] // B
+
+	fmacd	d10, d18, d22
+	fldd	d18, [r5, #80] // A
+	fmacd	d11, d19, d22
+	fldd	d22, [r6, #80] // B
+
+	fmacd	d15, d19, d23
+	fldd	d19, [r5, #88] // A
+	fldd	d23, [r6, #88] // B
+
+	// unroll 1
+	fmacd	d0, d24, d28
+	fldd	d24, [r5, #96] // A
+	fmacd	d1, d25, d28
+	fmacd	d2, d26, d28
+	fmacd	d3, d27, d28
+	fldd	d28, [r6, #96] // B
+
+	fmacd	d5, d25, d29
+	fldd	d25, [r5, #104] // A
+	fmacd	d6, d26, d29
+	fmacd	d7, d27, d29
+	fldd	d29, [r6, #104] // B
+
+	fmacd	d10, d26, d30
+	fldd	d26, [r5, #112] // A
+	fmacd	d11, d27, d30
+	fldd	d30, [r6, #112] // B
+
+	fmacd	d15, d27, d31
+	fldd	d27, [r5, #120] // A
+	fldd	d31, [r6, #120] // B
+
+	add		r5, r5, #128
+	add		r6, r6, #128
+
+	// unroll 2
+	fmacd	d0, d16, d20
+	fmacd	d1, d17, d20
+	fmacd	d2, d18, d20
+	fmacd	d3, d19, d20
+
+	fmacd	d5, d17, d21
+	fmacd	d6, d18, d21
+	fmacd	d7, d19, d21
+
+	fmacd	d10, d18, d22
+	fmacd	d11, d19, d22
+
+	fmacd	d15, d19, d23
+
+	// unroll 3
+	fmacd	d0, d24, d28
+	fmacd	d1, d25, d28
+	fmacd	d2, d26, d28
+	fmacd	d3, d27, d28
+
+	fmacd	d5, d25, d29
+	fmacd	d6, d26, d29
+	fmacd	d7, d27, d29
+
+	fmacd	d10, d26, d30
+	fmacd	d11, d27, d30
+
+	fmacd	d15, d27, d31
+
+	sub		r4, r4, #4
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		r4, #0
+	ble		2f // return
+
+3: // clean1-up loop
+
+	fldd	d16, [r5, #0] // A
+	fldd	d17, [r5, #8] // A
+	fldd	d18, [r5, #16] // A
+	fldd	d19, [r5, #24] // A
+
+	fldd	d20, [r6, #0] // B
+	fmacd	d0, d16, d20
+	fmacd	d1, d17, d20
+	fmacd	d2, d18, d20
+	fmacd	d3, d19, d20
+
+	fldd	d21, [r6, #8] // B
+	fmacd	d5, d17, d21
+	fmacd	d6, d18, d21
+	fmacd	d7, d19, d21
+
+	fldd	d22, [r6, #16] // B
+	fmacd	d10, d18, d22
+	fmacd	d11, d19, d22
+
+	fldd	d23, [r6, #24] // B
+	fmacd	d15, d19, d23
+
+	add		r5, r5, #32
+	add		r6, r6, #32
+
+	sub		r4, r4, #1
+	cmp		r4, #0
+	bgt		3b
+
+2: // return
+
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dsyrk_l_add_nt_4x4_lib4, .-inner_kernel_dsyrk_l_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- k
+// r5   <- A
+// r6   <- B
+//
+// output arguments:
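+//
+// note: identical structure to the add nt kernel above, with fnmacd in place
+// of fmacd, so the 4x4 accumulator is updated as acc -= A*B^T.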
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nt_4x4_lib4, %function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+
+	// early return
+	cmp		r4, #0
+	ble		2f // return
+
+	// prefetch
+	pld		[r5, #0]
+	pld		[r6, #0]
+
+	// preload A even
+	fldd	d16, [r5, #0]
+	fldd	d17, [r5, #8]
+	fldd	d18, [r5, #16]
+	fldd	d19, [r5, #24]
+
+	// preload B even
+	fldd	d20, [r6, #0]
+	fldd	d21, [r6, #8]
+	fldd	d22, [r6, #16]
+	fldd	d23, [r6, #24]
+
+	// preload A odd
+	fldd	d24, [r5, #32]
+	fldd	d25, [r5, #40]
+	fldd	d26, [r5, #48]
+	fldd	d27, [r5, #56]
+
+	// preload B odd
+	fldd	d28, [r6, #32]
+	fldd	d29, [r6, #40]
+	fldd	d30, [r6, #48]
+	fldd	d31, [r6, #56]
+
+	// prefetch
+	pld		[r5, #64]
+	pld		[r6, #64]
+
+	cmp		r4, #4
+	ble		0f // consider clean up loop
+
+	// main loop
+1:
+	
+	// prefetch
+	pld		[r5, #128]
+	pld		[r6, #128]
+
+	// unroll 0
+	fnmacd	d0, d16, d20
+	fnmacd	d1, d17, d20
+	fnmacd	d2, d18, d20
+	fnmacd	d3, d19, d20
+	fldd	d20, [r6, #64] // B
+
+	fnmacd	d4, d16, d21
+	fnmacd	d5, d17, d21
+	fnmacd	d6, d18, d21
+	fnmacd	d7, d19, d21
+	fldd	d21, [r6, #72] // B
+
+	fnmacd	d8, d16, d22
+	fnmacd	d9, d17, d22
+	fnmacd	d10, d18, d22
+	fnmacd	d11, d19, d22
+	fldd	d22, [r6, #80] // B
+
+	fnmacd	d12, d16, d23
+	fldd	d16, [r5, #64] // A
+	fnmacd	d13, d17, d23
+	fldd	d17, [r5, #72] // A
+	fnmacd	d14, d18, d23
+	fldd	d18, [r5, #80] // A
+	fnmacd	d15, d19, d23
+	fldd	d19, [r5, #88] // A
+	fldd	d23, [r6, #88] // B
+
+	// unroll 1
+	fnmacd	d0, d24, d28
+	fnmacd	d1, d25, d28
+	fnmacd	d2, d26, d28
+	fnmacd	d3, d27, d28
+	fldd	d28, [r6, #96] // B
+
+	fnmacd	d4, d24, d29
+	fnmacd	d5, d25, d29
+	fnmacd	d6, d26, d29
+	fnmacd	d7, d27, d29
+	fldd	d29, [r6, #104] // B
+
+	fnmacd	d8, d24, d30
+	fnmacd	d9, d25, d30
+	fnmacd	d10, d26, d30
+	fnmacd	d11, d27, d30
+	fldd	d30, [r6, #112] // B
+
+	fnmacd	d12, d24, d31
+	fldd	d24, [r5, #96] // A
+	fnmacd	d13, d25, d31
+	fldd	d25, [r5, #104] // A
+	fnmacd	d14, d26, d31
+	fldd	d26, [r5, #112] // A
+	fnmacd	d15, d27, d31
+	fldd	d27, [r5, #120] // A
+	fldd	d31, [r6, #120] // B
+
+	// prefetch
+	pld		[r5, #192]
+	pld		[r6, #192]
+
+	add		r5, r5, #128
+	add		r6, r6, #128
+
+	// unroll 2
+	fnmacd	d0, d16, d20
+	fnmacd	d1, d17, d20
+	fnmacd	d2, d18, d20
+	fnmacd	d3, d19, d20
+	fldd	d20, [r6, #0] // B
+
+	fnmacd	d4, d16, d21
+	fnmacd	d5, d17, d21
+	fnmacd	d6, d18, d21
+	fnmacd	d7, d19, d21
+	fldd	d21, [r6, #8] // B
+
+	fnmacd	d8, d16, d22
+	fnmacd	d9, d17, d22
+	fnmacd	d10, d18, d22
+	fnmacd	d11, d19, d22
+	fldd	d22, [r6, #16] // B
+
+	fnmacd	d12, d16, d23
+	fldd	d16, [r5, #0] // A
+	fnmacd	d13, d17, d23
+	fldd	d17, [r5, #8] // A
+	fnmacd	d14, d18, d23
+	fldd	d18, [r5, #16] // A
+	fnmacd	d15, d19, d23
+	fldd	d19, [r5, #24] // A
+	fldd	d23, [r6, #24] // B
+
+	// unroll 3
+	fnmacd	d0, d24, d28
+	fnmacd	d1, d25, d28
+	fnmacd	d2, d26, d28
+	fnmacd	d3, d27, d28
+	fldd	d28, [r6, #32] // B
+
+	fnmacd	d4, d24, d29
+	fnmacd	d5, d25, d29
+	fnmacd	d6, d26, d29
+	fnmacd	d7, d27, d29
+	fldd	d29, [r6, #40] // B
+
+	fnmacd	d8, d24, d30
+	fnmacd	d9, d25, d30
+	fnmacd	d10, d26, d30
+	fnmacd	d11, d27, d30
+	fldd	d30, [r6, #48] // B
+
+	fnmacd	d12, d24, d31
+	fldd	d24, [r5, #32] // A
+	fnmacd	d13, d25, d31
+	fldd	d25, [r5, #40] // A
+	fnmacd	d14, d26, d31
+	fldd	d26, [r5, #48] // A
+	fnmacd	d15, d27, d31
+	fldd	d27, [r5, #56] // A
+	fldd	d31, [r6, #56] // B
+
+	sub		r4, r4, #4
+	cmp		r4, #4
+	bgt		1b
+
+0:
+
+	cmp		r4, #3
+	ble		4f
+
+	// unroll 0
+	fnmacd	d0, d16, d20
+	fnmacd	d1, d17, d20
+	fnmacd	d2, d18, d20
+	fnmacd	d3, d19, d20
+	fldd	d20, [r6, #64] // B
+
+	fnmacd	d4, d16, d21
+	fnmacd	d5, d17, d21
+	fnmacd	d6, d18, d21
+	fnmacd	d7, d19, d21
+	fldd	d21, [r6, #72] // B
+
+	fnmacd	d8, d16, d22
+	fnmacd	d9, d17, d22
+	fnmacd	d10, d18, d22
+	fnmacd	d11, d19, d22
+	fldd	d22, [r6, #80] // B
+
+	fnmacd	d12, d16, d23
+	fldd	d16, [r5, #64] // A
+	fnmacd	d13, d17, d23
+	fldd	d17, [r5, #72] // A
+	fnmacd	d14, d18, d23
+	fldd	d18, [r5, #80] // A
+	fnmacd	d15, d19, d23
+	fldd	d19, [r5, #88] // A
+	fldd	d23, [r6, #88] // B
+
+	// unroll 1
+	fnmacd	d0, d24, d28
+	fnmacd	d1, d25, d28
+	fnmacd	d2, d26, d28
+	fnmacd	d3, d27, d28
+	fldd	d28, [r6, #96] // B
+
+	fnmacd	d4, d24, d29
+	fnmacd	d5, d25, d29
+	fnmacd	d6, d26, d29
+	fnmacd	d7, d27, d29
+	fldd	d29, [r6, #104] // B
+
+	fnmacd	d8, d24, d30
+	fnmacd	d9, d25, d30
+	fnmacd	d10, d26, d30
+	fnmacd	d11, d27, d30
+	fldd	d30, [r6, #112] // B
+
+	fnmacd	d12, d24, d31
+	fldd	d24, [r5, #96] // A
+	fnmacd	d13, d25, d31
+	fldd	d25, [r5, #104] // A
+	fnmacd	d14, d26, d31
+	fldd	d26, [r5, #112] // A
+	fnmacd	d15, d27, d31
+	fldd	d27, [r5, #120] // A
+	fldd	d31, [r6, #120] // B
+
+	add		r5, r5, #128
+	add		r6, r6, #128
+
+	// unroll 2
+	fnmacd	d0, d16, d20
+	fnmacd	d1, d17, d20
+	fnmacd	d2, d18, d20
+	fnmacd	d3, d19, d20
+
+	fnmacd	d4, d16, d21
+	fnmacd	d5, d17, d21
+	fnmacd	d6, d18, d21
+	fnmacd	d7, d19, d21
+
+	fnmacd	d8, d16, d22
+	fnmacd	d9, d17, d22
+	fnmacd	d10, d18, d22
+	fnmacd	d11, d19, d22
+
+	fnmacd	d12, d16, d23
+	fnmacd	d13, d17, d23
+	fnmacd	d14, d18, d23
+	fnmacd	d15, d19, d23
+
+	// unroll 3
+	fnmacd	d0, d24, d28
+	fnmacd	d1, d25, d28
+	fnmacd	d2, d26, d28
+	fnmacd	d3, d27, d28
+
+	fnmacd	d4, d24, d29
+	fnmacd	d5, d25, d29
+	fnmacd	d6, d26, d29
+	fnmacd	d7, d27, d29
+
+	fnmacd	d8, d24, d30
+	fnmacd	d9, d25, d30
+	fnmacd	d10, d26, d30
+	fnmacd	d11, d27, d30
+
+	fnmacd	d12, d24, d31
+	fnmacd	d13, d25, d31
+	fnmacd	d14, d26, d31
+	fnmacd	d15, d27, d31
+
+	sub		r4, r4, #4
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		r4, #0
+	ble		2f // return
+
+3: // clean1-up loop
+
+	fldd	d16, [r5, #0] // A
+	fldd	d17, [r5, #8] // A
+	fldd	d18, [r5, #16] // A
+	fldd	d19, [r5, #24] // A
+
+	fldd	d20, [r6, #0] // B
+	fnmacd	d0, d16, d20
+	fnmacd	d1, d17, d20
+	fnmacd	d2, d18, d20
+	fnmacd	d3, d19, d20
+
+	fldd	d21, [r6, #8] // B
+	fnmacd	d4, d16, d21
+	fnmacd	d5, d17, d21
+	fnmacd	d6, d18, d21
+	fnmacd	d7, d19, d21
+
+	fldd	d22, [r6, #16] // B
+	fnmacd	d8, d16, d22
+	fnmacd	d9, d17, d22
+	fnmacd	d10, d18, d22
+	fnmacd	d11, d19, d22
+
+	fldd	d23, [r6, #24] // B
+	fnmacd	d12, d16, d23
+	fnmacd	d13, d17, d23
+	fnmacd	d14, d18, d23
+	fnmacd	d15, d19, d23
+
+	add		r5, r5, #32
+	add		r6, r6, #32
+
+	sub		r4, r4, #1
+	cmp		r4, #0
+	bgt		3b
+
+2: // return
+
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- k
+// r5   <- A
+// r6   <- B
+//
+// output arguments:
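+//
+// note: lower-triangular variant of the sub nt kernel above: only d0-d3,
+// d5-d7, d10-d11 and d15 are updated, as acc -= A*B^T.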
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DSYRK_L_SUB_NT_4X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dsyrk_l_sub_nt_4x4_lib4, %function
+inner_kernel_dsyrk_l_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dsyrk_l_sub_nt_4x4_lib4:
+#endif
+#endif
+
+	// early return
+	cmp		r4, #0
+	ble		2f // return
+
+	// prefetch
+	pld		[r5, #0]
+	pld		[r6, #0]
+
+	// preload A even
+	fldd	d16, [r5, #0]
+	fldd	d17, [r5, #8]
+	fldd	d18, [r5, #16]
+	fldd	d19, [r5, #24]
+
+	// preload B even
+	fldd	d20, [r6, #0]
+	fldd	d21, [r6, #8]
+	fldd	d22, [r6, #16]
+	fldd	d23, [r6, #24]
+
+	// preload A odd
+	fldd	d24, [r5, #32]
+	fldd	d25, [r5, #40]
+	fldd	d26, [r5, #48]
+	fldd	d27, [r5, #56]
+
+	// preload B odd
+	fldd	d28, [r6, #32]
+	fldd	d29, [r6, #40]
+	fldd	d30, [r6, #48]
+	fldd	d31, [r6, #56]
+
+	// prefetch
+	pld		[r5, #64]
+	pld		[r6, #64]
+
+	cmp		r4, #4
+	ble		0f // consider clean up loop
+
+	// main loop
+1:
+	
+	// prefetch
+	pld		[r5, #128]
+	pld		[r6, #128]
+
+	// unroll 0
+	fnmacd	d0, d16, d20
+	fldd	d16, [r5, #64] // A
+	fnmacd	d1, d17, d20
+	fnmacd	d2, d18, d20
+	fnmacd	d3, d19, d20
+	fldd	d20, [r6, #64] // B
+
+	fnmacd	d5, d17, d21
+	fldd	d17, [r5, #72] // A
+	fnmacd	d6, d18, d21
+	fnmacd	d7, d19, d21
+	fldd	d21, [r6, #72] // B
+
+	fnmacd	d10, d18, d22
+	fldd	d18, [r5, #80] // A
+	fnmacd	d11, d19, d22
+	fldd	d22, [r6, #80] // B
+
+	fnmacd	d15, d19, d23
+	fldd	d19, [r5, #88] // A
+	fldd	d23, [r6, #88] // B
+
+	// unroll 1
+	fnmacd	d0, d24, d28
+	fldd	d24, [r5, #96] // A
+	fnmacd	d1, d25, d28
+	fnmacd	d2, d26, d28
+	fnmacd	d3, d27, d28
+	fldd	d28, [r6, #96] // B
+
+	fnmacd	d5, d25, d29
+	fldd	d25, [r5, #104] // A
+	fnmacd	d6, d26, d29
+	fnmacd	d7, d27, d29
+	fldd	d29, [r6, #104] // B
+
+	fnmacd	d10, d26, d30
+	fldd	d26, [r5, #112] // A
+	fnmacd	d11, d27, d30
+	fldd	d30, [r6, #112] // B
+
+	fnmacd	d15, d27, d31
+	fldd	d27, [r5, #120] // A
+	fldd	d31, [r6, #120] // B
+
+	// prefetch
+	pld		[r5, #192]
+	pld		[r6, #192]
+
+	add		r5, r5, #128
+	add		r6, r6, #128
+
+	// unroll 2
+	fnmacd	d0, d16, d20
+	fldd	d16, [r5, #0] // A
+	fnmacd	d1, d17, d20
+	fnmacd	d2, d18, d20
+	fnmacd	d3, d19, d20
+	fldd	d20, [r6, #0] // B
+
+	fnmacd	d5, d17, d21
+	fldd	d17, [r5, #8] // A
+	fnmacd	d6, d18, d21
+	fnmacd	d7, d19, d21
+	fldd	d21, [r6, #8] // B
+
+	fnmacd	d10, d18, d22
+	fldd	d18, [r5, #16] // A
+	fnmacd	d11, d19, d22
+	fldd	d22, [r6, #16] // B
+
+	fnmacd	d15, d19, d23
+	fldd	d19, [r5, #24] // A
+	fldd	d23, [r6, #24] // B
+
+	// unroll 3
+	fnmacd	d0, d24, d28
+	fldd	d24, [r5, #32] // A
+	fnmacd	d1, d25, d28
+	fnmacd	d2, d26, d28
+	fnmacd	d3, d27, d28
+	fldd	d28, [r6, #32] // B
+
+	fnmacd	d5, d25, d29
+	fldd	d25, [r5, #40] // A
+	fnmacd	d6, d26, d29
+	fnmacd	d7, d27, d29
+	fldd	d29, [r6, #40] // B
+
+	fnmacd	d10, d26, d30
+	fldd	d26, [r5, #48] // A
+	fnmacd	d11, d27, d30
+	fldd	d30, [r6, #48] // B
+
+	fnmacd	d15, d27, d31
+	fldd	d27, [r5, #56] // A
+	fldd	d31, [r6, #56] // B
+
+	sub		r4, r4, #4
+	cmp		r4, #4
+	bgt		1b
+
+0:
+
+	cmp		r4, #3
+	ble		4f
+
+	// unroll 0
+	fnmacd	d0, d16, d20
+	fldd	d16, [r5, #64] // A
+	fnmacd	d1, d17, d20
+	fnmacd	d2, d18, d20
+	fnmacd	d3, d19, d20
+	fldd	d20, [r6, #64] // B
+
+	fnmacd	d5, d17, d21
+	fldd	d17, [r5, #72] // A
+	fnmacd	d6, d18, d21
+	fnmacd	d7, d19, d21
+	fldd	d21, [r6, #72] // B
+
+	fnmacd	d10, d18, d22
+	fldd	d18, [r5, #80] // A
+	fnmacd	d11, d19, d22
+	fldd	d22, [r6, #80] // B
+
+	fnmacd	d15, d19, d23
+	fldd	d19, [r5, #88] // A
+	fldd	d23, [r6, #88] // B
+
+	// unroll 1
+	fnmacd	d0, d24, d28
+	fldd	d24, [r5, #96] // A
+	fnmacd	d1, d25, d28
+	fnmacd	d2, d26, d28
+	fnmacd	d3, d27, d28
+	fldd	d28, [r6, #96] // B
+
+	fnmacd	d5, d25, d29
+	fldd	d25, [r5, #104] // A
+	fnmacd	d6, d26, d29
+	fnmacd	d7, d27, d29
+	fldd	d29, [r6, #104] // B
+
+	fnmacd	d10, d26, d30
+	fldd	d26, [r5, #112] // A
+	fnmacd	d11, d27, d30
+	fldd	d30, [r6, #112] // B
+
+	fnmacd	d15, d27, d31
+	fldd	d27, [r5, #120] // A
+	fldd	d31, [r6, #120] // B
+
+	add		r5, r5, #128
+	add		r6, r6, #128
+
+	// unroll 2
+	fnmacd	d0, d16, d20
+	fnmacd	d1, d17, d20
+	fnmacd	d2, d18, d20
+	fnmacd	d3, d19, d20
+
+	fnmacd	d5, d17, d21
+	fnmacd	d6, d18, d21
+	fnmacd	d7, d19, d21
+
+	fnmacd	d10, d18, d22
+	fnmacd	d11, d19, d22
+
+	fnmacd	d15, d19, d23
+
+	// unroll 3
+	fnmacd	d0, d24, d28
+	fnmacd	d1, d25, d28
+	fnmacd	d2, d26, d28
+	fnmacd	d3, d27, d28
+
+	fnmacd	d5, d25, d29
+	fnmacd	d6, d26, d29
+	fnmacd	d7, d27, d29
+
+	fnmacd	d10, d26, d30
+	fnmacd	d11, d27, d30
+
+	fnmacd	d15, d27, d31
+
+	sub		r4, r4, #4
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		r4, #0
+	ble		2f // return
+
+3: // clean1-up loop
+
+	fldd	d16, [r5, #0] // A
+	fldd	d17, [r5, #8] // A
+	fldd	d18, [r5, #16] // A
+	fldd	d19, [r5, #24] // A
+
+	fldd	d20, [r6, #0] // B
+	fnmacd	d0, d16, d20
+	fnmacd	d1, d17, d20
+	fnmacd	d2, d18, d20
+	fnmacd	d3, d19, d20
+
+	fldd	d21, [r6, #8] // B
+	fnmacd	d5, d17, d21
+	fnmacd	d6, d18, d21
+	fnmacd	d7, d19, d21
+
+	fldd	d22, [r6, #16] // B
+	fnmacd	d10, d18, d22
+	fnmacd	d11, d19, d22
+
+	fldd	d23, [r6, #24] // B
+	fnmacd	d15, d19, d23
+
+	add		r5, r5, #32
+	add		r6, r6, #32
+
+	sub		r4, r4, #1
+	cmp		r4, #0
+	bgt		3b
+
+2: // return
+
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dsyrk_l_sub_nt_4x4_lib4, .-inner_kernel_dsyrk_l_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- k
+// r5   <- A
+// r6   <- B
+// r7   <- bs*sdb*sizeof(double)
+// r8   <- offsetB
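+//
+// note: when B does not start at the top of a panel (offsetB>0), this edge
+// routine consumes the first min(k, 4-offsetB) iterations one at a time,
+// starting at B + offsetB*sizeof(double), so that the main nn kernel can
+// continue panel-aligned; if iterations remain, B is advanced to the
+// beginning of the next panel.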
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgemm_add_nn_4x4_lib4, %function
+inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+	cmp		r8, #0
+	ble		2f // return
+
+	cmp		r4, #0
+	ble		2f // return
+
+	rsb		r9, r8, #4 // 4-offsetB
+	cmp		r9, r4
+//	ble		0f
+//	mov		r9, r4 // kend=min(k,4-offsetB)
+//0:
+	movgt	r9, r4 // kend=min(k,4-offsetB)
+	
+//	lsl		r10, r8, #3 // offsetB*sizeof(double)
+	add		r6, r6, r8, LSL #3 // B + offsetB*sizeof(double)
+
+1:
+	fldd	d16, [r5, #0] // A
+	fldd	d17, [r5, #8] // A
+	fldd	d18, [r5, #16] // A
+	fldd	d19, [r5, #24] // A
+
+	fldd	d20, [r6, #0] // B
+	fmacd	d0, d16, d20
+	fmacd	d1, d17, d20
+	fmacd	d2, d18, d20
+	fmacd	d3, d19, d20
+
+	fldd	d21, [r6, #32] // B
+	fmacd	d4, d16, d21
+	fmacd	d5, d17, d21
+	fmacd	d6, d18, d21
+	fmacd	d7, d19, d21
+
+	fldd	d22, [r6, #64] // B
+	fmacd	d8, d16, d22
+	fmacd	d9, d17, d22
+	fmacd	d10, d18, d22
+	fmacd	d11, d19, d22
+
+	fldd	d23, [r6, #96] // B
+	fmacd	d12, d16, d23
+	fmacd	d13, d17, d23
+	fmacd	d14, d18, d23
+	fmacd	d15, d19, d23
+
+	sub		r4, r4, #1
+	sub		r9, r9, #1
+	add		r5, r5, #32
+	add		r6, r6, #8
+
+	cmp		r9, #0
+	bgt		1b
+
+	cmp		r4, #0
+	ble		2f // return
+
+	add		r6, r6, r7
+	sub		r6, r6, #32
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgemm_add_nn_4x4_lib4, .-inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+	
+
+
+
+
+// subroutine
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r4   <- E
+// r5   <- inv_diag_E
+//
+// output arguments:
+// r4   <- E
+// r5   <- inv_diag_E
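+//
+// note: with E lower triangular this computes D = C * E^-T on the 4x4
+// accumulator, column by column (forward substitution); each division by a
+// diagonal entry of E is replaced by a multiplication with the precomputed
+// reciprocal inv_diag_E[j].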
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_4x4_lib4, %function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+	
+	// first column
+	fldd		d16, [r5, #0] // E_inv[0]
+	fmuld		d0, d0, d16
+	fmuld		d1, d1, d16
+	fmuld		d2, d2, d16
+	fmuld		d3, d3, d16
+
+	// second column
+	fldd		d16, [r4, #8] // E[1+4*0]
+	fnmacd		d4, d0, d16
+	fnmacd		d5, d1, d16
+	fnmacd		d6, d2, d16
+	fnmacd		d7, d3, d16
+	fldd		d16, [r5, #8] // E_inv[1]
+	fmuld		d4, d4, d16
+	fmuld		d5, d5, d16
+	fmuld		d6, d6, d16
+	fmuld		d7, d7, d16
+
+	// third column
+	fldd		d16, [r4, #16] // E[2+4*0]
+	fnmacd		d8, d0, d16
+	fnmacd		d9, d1, d16
+	fnmacd		d10, d2, d16
+	fnmacd		d11, d3, d16
+	fldd		d16, [r4, #48] // E[2+4*1]
+	fnmacd		d8, d4, d16
+	fnmacd		d9, d5, d16
+	fnmacd		d10, d6, d16
+	fnmacd		d11, d7, d16
+	fldd		d16, [r5, #16] // E_inv[2]
+	fmuld		d8, d8, d16
+	fmuld		d9, d9, d16
+	fmuld		d10, d10, d16
+	fmuld		d11, d11, d16
+
+	// fourth column
+	fldd		d16, [r4, #24] // E[3+4*0]
+	fnmacd		d12, d0, d16
+	fnmacd		d13, d1, d16
+	fnmacd		d14, d2, d16
+	fnmacd		d15, d3, d16
+	fldd		d16, [r4, #56] // E[3+4*1]
+	fnmacd		d12, d4, d16
+	fnmacd		d13, d5, d16
+	fnmacd		d14, d6, d16
+	fnmacd		d15, d7, d16
+	fldd		d16, [r4, #88] // E[3+4*2]
+	fnmacd		d12, d8, d16
+	fnmacd		d13, d9, d16
+	fnmacd		d14, d10, d16
+	fnmacd		d15, d11, d16
+	fldd		d16, [r5, #24] // E_inv[3]
+	fmuld		d12, d12, d16
+	fmuld		d13, d13, d16
+	fmuld		d14, d14, d16
+	fmuld		d15, d15, d16
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// Cholesky factorization
+//
+// input arguments:
+// r4   <- inv_diag_D
+//
+// output arguments:
+// r4   <- inv_diag_D
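+//
+// note: factorizes the (lower part of the) symmetric 4x4 accumulator in place
+// as L*L^T, storing the reciprocal of each diagonal entry, 1.0/L[j][j], into
+// inv_diag_D[j]; a pivot that is not strictly positive is replaced by 0.0
+// (the branches through .LC01).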
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dpotrf_4x4_lib4, %function
+inner_edge_dpotrf_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_lib4:
+#endif
+#endif
+	
+	fconstd		d16, #112 // 1.0
+	fldd		d17, .LC01 // 0.0
+
+	// first column
+	fcmped		d0, d17
+	fmstat
+	ble			1f
+	fsqrtd		d0, d0
+	fdivd		d18, d16, d0
+	fstd		d18, [r4, #0]
+2:
+	fmuld		d1, d1, d18
+	fmuld		d2, d2, d18
+	fmuld		d3, d3, d18
+
+	// second column
+	fnmacd		d5, d1, d1
+	fnmacd		d6, d1, d2
+	fnmacd		d7, d1, d3
+	fcmped		d5, d17
+	fmstat
+	ble			3f
+	fsqrtd		d5, d5
+	fdivd		d18, d16, d5
+	fstd		d18, [r4, #8]
+4:
+	fmuld		d6, d6, d18
+	fmuld		d7, d7, d18
+
+	// third column
+	fnmacd		d10, d2, d2
+	fnmacd		d11, d2, d3
+	fnmacd		d10, d6, d6
+	fnmacd		d11, d6, d7
+	fcmped		d10, d17
+	fmstat
+	ble			5f
+	fsqrtd		d10, d10
+	fdivd		d18, d16, d10
+	fstd		d18, [r4, #16]
+6:
+	fmuld		d11, d11, d18
+
+	// fourth column
+	fnmacd		d15, d3, d3
+	fnmacd		d15, d7, d7
+	fnmacd		d15, d11, d11
+	fcmped		d15, d17
+	fmstat
+	ble			7f
+	fsqrtd		d15, d15
+	fdivd		d18, d16, d15
+	fstd		d18, [r4, #24]
+
+	b			0f
+
+1:
+	fldd		d0, .LC01
+	b			2b
+
+3:
+	fldd		d5, .LC01
+	b			4b
+
+5:
+	fldd		d10, .LC01
+	b			6b
+
+7:
+	fldd		d15, .LC01
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dpotrf_4x4_lib4, .-inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+	.align 3
+.LC01: // { 0 }
+	.word 0
+	.word 0
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- alpha
+// r5   <- beta
+// r6   <- C
+//
+// output arguments:
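+//
+// note: scales the 4x4 accumulator by alpha and, unless beta compares equal
+// to 0.0, adds beta*C loaded from the C block; the beta==0.0 test skips the
+// 16 loads of C.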
+
+#if MACRO_LEVEL>=2
+	.macro INNER_SCALE_AB_4X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_4x4_lib4, %function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#endif
+#endif
+
+	fldd	d16, [r4, #0] // alpha
+
+	fmuld	d0, d0, d16
+	fldd	d18, [r5, #0] // beta
+	fmuld	d1, d1, d16
+	fldd	d17, .LC01 // 0.0
+	fmuld	d2, d2, d16
+	fmuld	d3, d3, d16
+
+	fmuld	d4, d4, d16
+	fmuld	d5, d5, d16
+	fmuld	d6, d6, d16
+	fmuld	d7, d7, d16
+
+	fmuld	d8, d8, d16
+	fcmped	d18, d17
+	fmuld	d9, d9, d16
+	fmuld	d10, d10, d16
+	fmuld	d11, d11, d16
+
+	fmuld	d12, d12, d16
+	fmstat
+	fmuld	d13, d13, d16
+	fmuld	d14, d14, d16
+	fmuld	d15, d15, d16
+
+	beq		0f // end
+
+	fldd	d17, [r6, #0] // C
+	fmacd	d0, d18, d17
+	fldd	d17, [r6, #8] // C
+	fmacd	d1, d18, d17
+	fldd	d17, [r6, #16] // C
+	fmacd	d2, d18, d17
+	fldd	d17, [r6, #24] // C
+	fmacd	d3, d18, d17
+
+	fldd	d17, [r6, #32] // C
+	fmacd	d4, d18, d17
+	fldd	d17, [r6, #40] // C
+	fmacd	d5, d18, d17
+	fldd	d17, [r6, #48] // C
+	fmacd	d6, d18, d17
+	fldd	d17, [r6, #56] // C
+	fmacd	d7, d18, d17
+
+	fldd	d17, [r6, #64] // C
+	fmacd	d8, d18, d17
+	fldd	d17, [r6, #72] // C
+	fmacd	d9, d18, d17
+	fldd	d17, [r6, #80] // C
+	fmacd	d10, d18, d17
+	fldd	d17, [r6, #88] // C
+	fmacd	d11, d18, d17
+
+	fldd	d17, [r6, #96] // C
+	fmacd	d12, d18, d17
+	fldd	d17, [r6, #104] // C
+	fmacd	d13, d18, d17
+	fldd	d17, [r6, #112] // C
+	fmacd	d14, d18, d17
+	fldd	d17, [r6, #120] // C
+	fmacd	d15, d18, d17
+
+0:
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- C
+//
+// output arguments:
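+//
+// note: presumably the alpha=1.0, beta=1.0 special case of the scaling above
+// (hence _11_): it simply adds the 16 entries of C to the accumulator, with
+// no multiplications.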
+
+#if MACRO_LEVEL>=2
+	.macro INNER_SCALE_11_4X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_4x4_lib4, %function
+inner_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_4x4_lib4:
+#endif
+#endif
+
+	fldd	d17, [r4, #0] // C
+	faddd	d0, d0, d17
+	fldd	d17, [r4, #8] // C
+	faddd	d1, d1, d17
+	fldd	d17, [r4, #16] // C
+	faddd	d2, d2, d17
+	fldd	d17, [r4, #24] // C
+	faddd	d3, d3, d17
+
+	fldd	d17, [r4, #32] // C
+	faddd	d4, d4, d17
+	fldd	d17, [r4, #40] // C
+	faddd	d5, d5, d17
+	fldd	d17, [r4, #48] // C
+	faddd	d6, d6, d17
+	fldd	d17, [r4, #56] // C
+	faddd	d7, d7, d17
+
+	fldd	d17, [r4, #64] // C
+	faddd	d8, d8, d17
+	fldd	d17, [r4, #72] // C
+	faddd	d9, d9, d17
+	fldd	d17, [r4, #80] // C
+	faddd	d10, d10, d17
+	fldd	d17, [r4, #88] // C
+	faddd	d11, d11, d17
+
+	fldd	d17, [r4, #96] // C
+	faddd	d12, d12, d17
+	fldd	d17, [r4, #104] // C
+	faddd	d13, d13, d17
+	fldd	d17, [r4, #112] // C
+	faddd	d14, d14, d17
+	fldd	d17, [r4, #120] // C
+	faddd	d15, d15, d17
+
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_4x4_lib4, .-inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- D
+//
+// output arguments:
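+//
+// note: stores the 4x4 accumulator d0-d15 to D, one 32-byte column at a time
+// (column-major within the 4-wide panel)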
+
+#if MACRO_LEVEL>=2
+	.macro INNER_STORE_4X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_lib4, %function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#endif
+#endif
+
+	fstd	d0, [r4, #0]
+	fstd	d1, [r4, #8]
+	fstd	d2, [r4, #16]
+	fstd	d3, [r4, #24]
+
+	fstd	d4, [r4, #32]
+	fstd	d5, [r4, #40]
+	fstd	d6, [r4, #48]
+	fstd	d7, [r4, #56]
+
+	fstd	d8, [r4, #64]
+	fstd	d9, [r4, #72]
+	fstd	d10, [r4, #80]
+	fstd	d11, [r4, #88]
+
+	fstd	d12, [r4, #96]
+	fstd	d13, [r4, #104]
+	fstd	d14, [r4, #112]
+	fstd	d15, [r4, #120]
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- D
+//
+// output arguments:
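+//
+// note: stores only the lower-triangular part of the 4x4 accumulator to D;
+// the strictly upper-triangular entries of D are left untouched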
+
+#if MACRO_LEVEL>=2
+	.macro INNER_STORE_L_4X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_4x4_lib4, %function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#endif
+#endif
+
+	fstd	d0, [r4, #0]
+	fstd	d1, [r4, #8]
+	fstd	d2, [r4, #16]
+	fstd	d3, [r4, #24]
+
+//	fstd	d4, [r4, #32]
+	fstd	d5, [r4, #40]
+	fstd	d6, [r4, #48]
+	fstd	d7, [r4, #56]
+
+//	fstd	d8, [r4, #64]
+//	fstd	d9, [r4, #72]
+	fstd	d10, [r4, #80]
+	fstd	d11, [r4, #88]
+
+//	fstd	d12, [r4, #96]
+//	fstd	d13, [r4, #104]
+//	fstd	d14, [r4, #112]
+	fstd	d15, [r4, #120]
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// zero double word
+	.align 3
+.LC00: // { 0 }
+	.word 0
+	.word 0
+
+//                               r0        r1             r2         r3         sp+0          sp+4       sp+8
+// void kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
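+//
+// operation: D = alpha*A*B^T + beta*C on a 4x4 block in panel-major (lib4)
+// format, i.e. element (i,j) of a 4-row panel sits at offset i+4*j; scalar
+// reference sketch (documentation only, not assembled):
+//   for(jj=0; jj<4; jj++) for(ii=0; ii<4; ii++) {
+//     double acc = 0.0;
+//     for(kk=0; kk<kmax; kk++) acc += A[ii+4*kk] * B[jj+4*kk];
+//     D[ii+4*jj] = alpha[0]*acc + beta[0]*C[ii+4*jj];
+//   }
+//   (when beta[0]==0.0 the actual kernel does not read C at all)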
+
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.global	kernel_dgemm_nt_4x4_lib4
+	.type	kernel_dgemm_nt_4x4_lib4, %function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+	.global	kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#endif
+
+	// prologue
+
+	// save GP registers
+	stmdb	sp!, {r4 - r10, fp, lr} // save registers
+	add		fp, sp, #36 // fp to old sp position
+
+	// save FP registers
+	fstmfdd	sp!, {d8-d15}
+
+
+
+	// zero accumulation registers
+	fldd	d0, .LC00
+	fcpyd	d1, d0
+	fcpyd	d2, d0
+	fcpyd	d3, d0
+	fcpyd	d4, d0
+	fcpyd	d5, d0
+	fcpyd	d6, d0
+	fcpyd	d7, d0
+	fcpyd	d8, d0
+	fcpyd	d9, d0
+	fcpyd	d10, d0
+	fcpyd	d11, d0
+	fcpyd	d12, d0
+	fcpyd	d13, d0
+	fcpyd	d14, d0
+	fcpyd	d15, d0
+
+
+
+	// call inner kernel dgemm nt
+	mov		r4, r0 // kmax
+	mov		r5, r2 // A
+	mov		r6, r3 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl	inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	bl	_inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+	// call inner blend for generic alpha and beta
+	mov		r4, r1 // alpha
+	ldr		r5, [fp, #0] // beta
+	ldr		r6, [fp, #4] // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+	// store n
+	ldr		r4, [fp, #8] // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+	// epilogue
+
+	// load FP registers
+	fldmfdd	sp!, {d8-d15}
+
+	// load GP registers and return
+//	ldmia	sp!, {r4 - r10, fp, lr} // load registers
+//	mov		pc, lr // return
+	ldmia	sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
+
+
+
+
+
+//                               r0        r1             r2         r3           sp+0       sp+4     sp+8          sp+12      sp+16
+// void kernel_dgemm_nn_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D)
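+//
+// operation: D = alpha*A*B + beta*C on a 4x4 block; B is panel-major with
+// panel stride sdb (consecutive 4-row panels of B are 4*sdb elements apart)
+// and offsetB selects the starting row inside the first panel, which is
+// handled by the inner edge subroutine called below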
+
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.global	kernel_dgemm_nn_4x4_lib4
+	.type	kernel_dgemm_nn_4x4_lib4, %function
+kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+	.global	kernel_dgemm_nn_4x4_lib4
+_kernel_dgemm_nn_4x4_lib4:
+#endif
+
+	// prologue
+
+	// save GP registers
+	stmdb	sp!, {r4 - r10, fp, lr} // save registers
+	add		fp, sp, #36 // fp to old sp position
+
+	// save FP registers
+	fstmfdd	sp!, {d8-d15}
+
+
+
+	// zero accumulation registers
+	fldd	d0, .LC00
+	fcpyd	d1, d0
+	fcpyd	d2, d0
+	fcpyd	d3, d0
+	fcpyd	d4, d0
+	fcpyd	d5, d0
+	fcpyd	d6, d0
+	fcpyd	d7, d0
+	fcpyd	d8, d0
+	fcpyd	d9, d0
+	fcpyd	d10, d0
+	fcpyd	d11, d0
+	fcpyd	d12, d0
+	fcpyd	d13, d0
+	fcpyd	d14, d0
+	fcpyd	d15, d0
+
+
+
+	// call inner kernel dgemm nt
+	mov		r4, r0 // kmax
+	mov		r5, r2 // A
+	ldr		r6, [fp, #0] // B
+	ldr		r7, [fp, #4] // sdb
+	lsl		r7, r7, #5 // 4*sizeof(double)*sdb
+	mov		r8, r3 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl	inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	bl	_inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl	inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	bl	_inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+	// call inner blend for generic alpha and beta
+	mov		r4, r1 // alpha
+	ldr		r5, [fp, #8] // beta
+	ldr		r6, [fp, #12] // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+	// store n
+	ldr		r4, [fp, #16] // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+	// epilogue
+
+	// load FP registers
+	fldmfdd	sp!, {d8-d15}
+
+	// load GP registers and return
+//	ldmia	sp!, {r4 - r10, fp, lr} // load registers
+//	mov		pc, lr // return
+	ldmia	sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_4x4_lib4, .-kernel_dgemm_nn_4x4_lib4
+#endif
+
+
+
+
+
+//                                 r0        r1             r2         r3         sp+0          sp+4       sp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D);
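+//
+// operation: lower-triangular part of D = alpha*A*B^T + beta*C on a 4x4
+// block; only the lower triangle of D is written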
+
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_4x4_lib4
+	.type kernel_dsyrk_nt_l_4x4_lib4, %function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+
+	// prologue
+
+	// save GP registers
+	stmdb	sp!, {r4 - r10, fp, lr} // save registers
+	add		fp, sp, #36 // fp to old sp position
+
+	// save FP registers
+	fstmfdd	sp!, {d8-d15}
+
+
+
+	// zero accumulation registers
+	fldd	d0, .LC00
+	fcpyd	d1, d0
+	fcpyd	d2, d0
+	fcpyd	d3, d0
+	fcpyd	d4, d0
+	fcpyd	d5, d0
+	fcpyd	d6, d0
+	fcpyd	d7, d0
+	fcpyd	d8, d0
+	fcpyd	d9, d0
+	fcpyd	d10, d0
+	fcpyd	d11, d0
+	fcpyd	d12, d0
+	fcpyd	d13, d0
+	fcpyd	d14, d0
+	fcpyd	d15, d0
+
+
+
+	// call inner kernel dsyrk l nt
+	mov		r4, r0 // kmax
+	mov		r5, r2 // A
+	mov		r6, r3 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DSYRK_L_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl	inner_kernel_dsyrk_l_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	bl	_inner_kernel_dsyrk_l_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+	// call inner blend for generic alpha and beta
+	mov		r4, r1 // alpha
+	ldr		r5, [fp, #0] // beta
+	ldr		r6, [fp, #4] // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+	// store l
+	ldr		r4, [fp, #8] // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+	// epilogue
+
+	// load FP registers
+	fldmfdd	sp!, {d8-d15}
+
+	// load GP registers and return
+//	ldmia	sp!, {r4 - r10, fp, lr} // load registers
+//	mov		pc, lr // return
+	ldmia	sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+//                                      r0        r1         r2         r3         sp+0       sp+4        sp+8
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
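+//
+// operation (as implied by the inner subroutines called below):
+// D = (C - A*B^T) * E^-T, with E lower triangular ("right, lower, transposed"
+// solve) and inv_diag_E holding the reciprocals of the diagonal of E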
+
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+	.type kernel_dtrsm_nt_rl_inv_4x4_lib4, %function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+	// prologue
+
+	// save GP registers
+	stmdb	sp!, {r4 - r10, fp, lr} // save registers
+	add		fp, sp, #36 // fp to old sp position
+
+	// save FP registers
+	fstmfdd	sp!, {d8-d15}
+
+
+
+	// zero accumulation registers
+	fldd	d0, .LC00
+	fcpyd	d1, d0
+	fcpyd	d2, d0
+	fcpyd	d3, d0
+	fcpyd	d4, d0
+	fcpyd	d5, d0
+	fcpyd	d6, d0
+	fcpyd	d7, d0
+	fcpyd	d8, d0
+	fcpyd	d9, d0
+	fcpyd	d10, d0
+	fcpyd	d11, d0
+	fcpyd	d12, d0
+	fcpyd	d13, d0
+	fcpyd	d14, d0
+	fcpyd	d15, d0
+
+
+
+	// call inner kernel dgemm nt sub
+	mov		r4, r0 // kmax
+	mov		r5, r1 // A
+	mov		r6, r2 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl	inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	bl	_inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+	// call inner blend for alpha=1.0 and beta=1.0
+	mov		r4, r3 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+	// factorization
+	ldr		r4, [fp, #4] // E
+	ldr		r5, [fp, #8] // inv_diag_E
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+	// store n
+	ldr		r4, [fp, #0] // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+	// epilogue
+
+	// load FP registers
+	fldmfdd	sp!, {d8-d15}
+
+	// load GP registers and return
+//	ldmia	sp!, {r4 - r10, fp, lr} // load registers
+//	mov		pc, lr // return
+	ldmia	sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+//                                  r0        r1         r2         r3         sp+0       sp+4
+// void kernel_dpotrf_nt_l_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D);
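+//
+// operation: D = lower Cholesky factor of (C - A*B^T), lower triangle stored;
+// inv_diag_D receives the reciprocals of the diagonal of D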
+
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_4x4_lib4
+	.type kernel_dpotrf_nt_l_4x4_lib4, %function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+
+	// prologue
+
+	// save GP registers
+	stmdb	sp!, {r4 - r10, fp, lr} // save registers
+	add		fp, sp, #36 // fp to old sp position
+
+	// save FP registers
+	fstmfdd	sp!, {d8-d15}
+
+
+
+	// zero accumulation registers
+	fldd	d0, .LC00
+	fcpyd	d1, d0
+	fcpyd	d2, d0
+	fcpyd	d3, d0
+	fcpyd	d4, d0
+	fcpyd	d5, d0
+	fcpyd	d6, d0
+	fcpyd	d7, d0
+	fcpyd	d8, d0
+	fcpyd	d9, d0
+	fcpyd	d10, d0
+	fcpyd	d11, d0
+	fcpyd	d12, d0
+	fcpyd	d13, d0
+	fcpyd	d14, d0
+	fcpyd	d15, d0
+
+
+
+	// call inner kernel dsyrk l nt
+	mov		r4, r0 // kmax
+	mov		r5, r1 // A
+	mov		r6, r2 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DSYRK_L_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl	inner_kernel_dsyrk_l_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	bl	_inner_kernel_dsyrk_l_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+	// call inner blend for alpha=1.0 and beta=1.0
+	mov		r4, r3 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+	// factorization
+	ldr		r4, [fp, #4] // inv_diag_D
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_edge_dpotrf_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+
+	// store l
+	ldr		r4, [fp, #0] // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+	// epilogue
+
+	// load FP registers
+	fldmfdd	sp!, {d8-d15}
+
+	// load GP registers and return
+//	ldmia	sp!, {r4 - r10, fp, lr} // load registers
+//	mov		pc, lr // return
+	ldmia	sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+//                                            r0      r1          r2          r3      sp+0        sp+4        sp+8       sp+12      sp+16      sp+20
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
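+//
+// fused kernel (as implied by the inner subroutines called below):
+// D = (C + Ap*Bp^T - Am*Bm^T) * E^-T, i.e. a gemm update followed by the same
+// triangular solve as kernel_dtrsm_nt_rl_inv_4x4_lib4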
+
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, %function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+	// prologue
+
+	// save GP registers
+	stmdb	sp!, {r4 - r10, fp, lr} // save registers
+	add		fp, sp, #36 // fp to old sp position
+
+	// save FP registers
+	fstmfdd	sp!, {d8-d15}
+
+
+
+	// zero accumulation registers
+	fldd	d0, .LC00
+	fcpyd	d1, d0
+	fcpyd	d2, d0
+	fcpyd	d3, d0
+	fcpyd	d4, d0
+	fcpyd	d5, d0
+	fcpyd	d6, d0
+	fcpyd	d7, d0
+	fcpyd	d8, d0
+	fcpyd	d9, d0
+	fcpyd	d10, d0
+	fcpyd	d11, d0
+	fcpyd	d12, d0
+	fcpyd	d13, d0
+	fcpyd	d14, d0
+	fcpyd	d15, d0
+
+
+
+	// call inner kernel dgemm nt add
+	mov		r4, r0 // kp
+	mov		r5, r1 // Ap
+	mov		r6, r2 // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl	inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	bl	_inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+	// call inner kernel dgemm nt sub
+	mov		r4, r3 // kmax
+	ldr		r5, [fp, #0] // Am
+	ldr		r6, [fp, #4] // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl	inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	bl	_inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+	// call inner blend for alpha=1.0 and beta=1.0
+	ldr		r4, [fp, #8] // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+	// factorization
+	ldr		r4, [fp, #16] // E
+	ldr		r5, [fp, #20] // inv_diag_E
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+	// store n
+	ldr		r4, [fp, #12] // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+	// epilogue
+
+	// load FP registers
+	fldmfdd	sp!, {d8-d15}
+
+	// load GP registers and return
+//	ldmia	sp!, {r4 - r10, fp, lr} // load registers
+//	mov		pc, lr // return
+	ldmia	sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+//                                        r0      r1          r2          r3      sp+0        sp+4        sp+8       sp+12      sp+16
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
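+//
+// fused kernel: D = lower Cholesky factor of (C + Ap*Bp^T - Am*Bm^T), lower
+// triangle stored; inv_diag_D receives the reciprocals of the diagonal of D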
+
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, %function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+
+	// prologue
+
+	// save GP registers
+	stmdb	sp!, {r4 - r10, fp, lr} // save registers
+	add		fp, sp, #36 // fp to old sp position
+
+	// save FP registers
+	fstmfdd	sp!, {d8-d15}
+
+
+
+	// zero accumulation registers
+	fldd	d0, .LC00
+	fcpyd	d1, d0
+	fcpyd	d2, d0
+	fcpyd	d3, d0
+	fcpyd	d4, d0
+	fcpyd	d5, d0
+	fcpyd	d6, d0
+	fcpyd	d7, d0
+	fcpyd	d8, d0
+	fcpyd	d9, d0
+	fcpyd	d10, d0
+	fcpyd	d11, d0
+	fcpyd	d12, d0
+	fcpyd	d13, d0
+	fcpyd	d14, d0
+	fcpyd	d15, d0
+
+
+
+	// call inner kernel dsyrk l nt
+	mov		r4, r0 // kp
+	mov		r5, r1 // Ap
+	mov		r6, r2 // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DSYRK_L_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl	inner_kernel_dsyrk_l_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	bl	_inner_kernel_dsyrk_l_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+	// call inner kernel dsyrk l nt sub
+	mov		r4, r3 // kmax
+	ldr		r5, [fp, #0] // Am
+	ldr		r6, [fp, #4] // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DSYRK_L_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl	inner_kernel_dsyrk_l_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	bl	_inner_kernel_dsyrk_l_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+	// call inner blend for alpha=1.0 and beta=1.0
+	ldr		r4, [fp, #8] // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+	// factorization
+	ldr		r4, [fp, #16] // inv_diag_D
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_edge_dpotrf_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+
+	// store l
+	ldr		r4, [fp, #12] // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+	// epilogue
+
+	// load FP registers
+	fldmfdd	sp!, {d8-d15}
+
+	// load GP registers and return
+//	ldmia	sp!, {r4 - r10, fp, lr} // load registers
+//	mov		pc, lr // return
+	ldmia	sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
diff --git a/kernel/armv7a/kernel_sgemm_12x4_lib4.S b/kernel/armv7a/kernel_sgemm_12x4_lib4.S
new file mode 100644
index 0000000..96ff7a4
--- /dev/null
+++ b/kernel/armv7a/kernel_sgemm_12x4_lib4.S
@@ -0,0 +1,589 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- k
+// r5   <- A
+// r6   <- sda
+// r7   <- B
+//
+// output arguments:
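+//
+// note: 12x4 accumulation; A is split into three 4-row panels A0, A1, A2
+// located r6 bytes apart (the caller passes r6 = 4*sizeof(float)*sda), and
+// the result is kept in q4-q7 (rows 0-3), q8-q11 (rows 4-7) and q12-q15
+// (rows 8-11)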
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_12X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nt_12x4_lib4, %function
+inner_kernel_gemm_add_nt_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_12x4_lib4:
+#endif
+#endif
+
+	// early return
+	cmp		r4, #0
+	ble		2f // return
+
+	add		r8, r5, r6 // A1
+	add		r9, r8, r6 // A2
+
+	// prefetch
+	pld			[r5, #0] // A0
+	pld			[r7, #0] // B
+	pld			[r8, #0] // A1
+	pld			[r9, #0] // A2
+
+	// preload
+	vld1.64		{d0, d1}, [r7:128] // B
+	vld1.64		{d2, d3}, [r5:128] // A0
+	vld1.64		{d4, d5}, [r8:128] // A1
+//	vld1.64		{d6, d7}, [r9:128] // A2
+
+	cmp		r4, #4
+	ble		0f // consider clean up loop
+
+	// prefetch
+	pld			[r5, #64] // A0
+	pld			[r7, #64] // B
+	pld			[r8, #64] // A1
+	pld			[r9, #64] // A2
+
+	// main loop
+1:
+	
+	// unroll 0
+	vmla.f32	q4, q1, d0[0]
+	vldr		d6, [r9, #0] // A2
+	vmla.f32	q5, q1, d0[1]
+	vldr		d7, [r9, #8] // A2
+	vmla.f32	q6, q1, d1[0]
+	pld			[r7, #128]
+	vmla.f32	q7, q1, d1[1]
+	vldr		d2, [r5, #16] // A0
+	vmla.f32	q8, q2, d0[0]
+	vldr		d3, [r5, #24] // A0
+	vmla.f32	q9, q2, d0[1]
+	pld			[r5, #128]
+	vmla.f32	q10, q2, d1[0]
+	pld			[r8, #128]
+	vmla.f32	q11, q2, d1[1]
+	vldr		d4, [r7, #16] // B
+	vmla.f32	q12, q3, d0[0]
+	vldr		d5, [r7, #24] // B
+	vmla.f32	q13, q3, d0[1]
+	vldr		d0, [r8, #16] // A1
+	vmla.f32	q14, q3, d1[0]
+	pld			[r9, #128]
+	vmla.f32	q15, q3, d1[1]
+	vldr		d1, [r8, #24] // A1
+
+	// unroll 1
+	vmla.f32	q4, q1, d4[0]
+	vldr		d6, [r9, #16] // A2
+	vmla.f32	q5, q1, d4[1]
+	vldr		d7, [r9, #24] // A2
+	vmla.f32	q6, q1, d5[0]
+	sub		r4, r4, #4
+	vmla.f32	q7, q1, d5[1]
+	vldr		d2, [r5, #32] // A0
+	vmla.f32	q8, q0, d4[0]
+	vldr		d3, [r5, #40] // A0
+	vmla.f32	q9, q0, d4[1]
+	vmla.f32	q10, q0, d5[0]
+	vmla.f32	q11, q0, d5[1]
+	vldr		d0, [r7, #32] // B
+	vmla.f32	q12, q3, d4[0]
+	vldr		d1, [r7, #40] // B
+	vmla.f32	q13, q3, d4[1]
+	vldr		d4, [r8, #32] // A1
+	vmla.f32	q14, q3, d5[0]
+	vmla.f32	q15, q3, d5[1]
+	vldr		d5, [r8, #40] // A1
+
+	// unroll 2
+	vmla.f32	q4, q1, d0[0]
+	vldr		d6, [r9, #32] // A2
+	vmla.f32	q5, q1, d0[1]
+	vldr		d7, [r9, #40] // A2
+	vmla.f32	q6, q1, d1[0]
+	vmla.f32	q7, q1, d1[1]
+	vldr		d2, [r5, #48] // A0
+	vmla.f32	q8, q2, d0[0]
+	vldr		d3, [r5, #56] // A0
+	vmla.f32	q9, q2, d0[1]
+	vmla.f32	q10, q2, d1[0]
+	add		r5, r5, #64
+	vmla.f32	q11, q2, d1[1]
+	vldr		d4, [r7, #48] // B
+	vmla.f32	q12, q3, d0[0]
+	vldr		d5, [r7, #56] // B
+	vmla.f32	q13, q3, d0[1]
+	vldr		d0, [r8, #48] // A1
+	vmla.f32	q14, q3, d1[0]
+	add		r7, r7, #64
+	vmla.f32	q15, q3, d1[1]
+	vldr		d1, [r8, #56] // A1
+
+	// unroll 3
+	vmla.f32	q4, q1, d4[0]
+	vldr		d6, [r9, #48] // A2
+	vmla.f32	q5, q1, d4[1]
+	vldr		d7, [r9, #56] // A2
+	vmla.f32	q6, q1, d5[0]
+	add		r8, r8, #64
+	vmla.f32	q7, q1, d5[1]
+	vldr		d2, [r5, #0] // A0
+	vmla.f32	q8, q0, d4[0]
+	vldr		d3, [r5, #8] // A0
+	vmla.f32	q9, q0, d4[1]
+	add		r9, r9, #64
+	vmla.f32	q10, q0, d5[0]
+	cmp		r4, #4
+	vmla.f32	q11, q0, d5[1]
+	vldr		d0, [r7, #0] // B
+	vmla.f32	q12, q3, d4[0]
+	vldr		d1, [r7, #8] // B
+	vmla.f32	q13, q3, d4[1]
+	vldr		d4, [r8, #0] // A1
+	vmla.f32	q14, q3, d5[0]
+	vmla.f32	q15, q3, d5[1]
+	vldr		d5, [r8, #8] // A1
+
+
+	bgt		1b
+
+0:
+
+	cmp		r4, #3
+	ble		4f
+
+
+	// unroll 0
+	vmla.f32	q4, q1, d0[0]
+	vldr		d6, [r9, #0] // A2
+	vmla.f32	q5, q1, d0[1]
+	vldr		d7, [r9, #8] // A2
+	vmla.f32	q6, q1, d1[0]
+	pld			[r7, #64]
+	vmla.f32	q7, q1, d1[1]
+	vldr		d2, [r5, #16] // A0
+	vmla.f32	q8, q2, d0[0]
+	vldr		d3, [r5, #24] // A0
+	vmla.f32	q9, q2, d0[1]
+	pld			[r5, #64]
+	vmla.f32	q10, q2, d1[0]
+	pld			[r8, #64]
+	vmla.f32	q11, q2, d1[1]
+	vldr		d4, [r7, #16] // B
+	vmla.f32	q12, q3, d0[0]
+	vldr		d5, [r7, #24] // B
+	vmla.f32	q13, q3, d0[1]
+	vldr		d0, [r8, #16] // A1
+	vmla.f32	q14, q3, d1[0]
+	pld			[r9, #64]
+	vmla.f32	q15, q3, d1[1]
+	vldr		d1, [r8, #24] // A1
+
+	// unroll 1
+	vmla.f32	q4, q1, d4[0]
+	vldr		d6, [r9, #16] // A2
+	vmla.f32	q5, q1, d4[1]
+	vldr		d7, [r9, #24] // A2
+	vmla.f32	q6, q1, d5[0]
+	sub		r4, r4, #4
+	vmla.f32	q7, q1, d5[1]
+	vldr		d2, [r5, #32] // A0
+	vmla.f32	q8, q0, d4[0]
+	vldr		d3, [r5, #40] // A0
+	vmla.f32	q9, q0, d4[1]
+	vmla.f32	q10, q0, d5[0]
+	vmla.f32	q11, q0, d5[1]
+	vldr		d0, [r7, #32] // B
+	vmla.f32	q12, q3, d4[0]
+	vldr		d1, [r7, #40] // B
+	vmla.f32	q13, q3, d4[1]
+	vldr		d4, [r8, #32] // A1
+	vmla.f32	q14, q3, d5[0]
+	vmla.f32	q15, q3, d5[1]
+	vldr		d5, [r8, #40] // A1
+
+	// unroll 2
+	vmla.f32	q4, q1, d0[0]
+	vldr		d6, [r9, #32] // A2
+	vmla.f32	q5, q1, d0[1]
+	vldr		d7, [r9, #40] // A2
+	vmla.f32	q6, q1, d1[0]
+	vmla.f32	q7, q1, d1[1]
+	vldr		d2, [r5, #48] // A0
+	vmla.f32	q8, q2, d0[0]
+	vldr		d3, [r5, #56] // A0
+	vmla.f32	q9, q2, d0[1]
+	vmla.f32	q10, q2, d1[0]
+	add		r5, r5, #64
+	vmla.f32	q11, q2, d1[1]
+	vldr		d4, [r7, #48] // B
+	vmla.f32	q12, q3, d0[0]
+	vldr		d5, [r7, #56] // B
+	vmla.f32	q13, q3, d0[1]
+	vldr		d0, [r8, #48] // A1
+	vmla.f32	q14, q3, d1[0]
+	add		r7, r7, #64
+	vmla.f32	q15, q3, d1[1]
+	vldr		d1, [r8, #56] // A1
+
+	// unroll 3
+	vmla.f32	q4, q1, d4[0]
+	vldr		d6, [r9, #48] // A2
+	vmla.f32	q5, q1, d4[1]
+	vldr		d7, [r9, #56] // A2
+	vmla.f32	q6, q1, d5[0]
+	add		r9, r9, #64
+	vmla.f32	q7, q1, d5[1]
+//	vldr		d2, [r5, #0] // A0
+	vmla.f32	q8, q0, d4[0]
+//	vldr		d3, [r5, #8] // A0
+	vmla.f32	q9, q0, d4[1]
+	vmla.f32	q10, q0, d5[0]
+	add		r8, r8, #64
+	vmla.f32	q11, q0, d5[1]
+//	vldr		d0, [r7, #0] // B
+	vmla.f32	q12, q3, d4[0]
+//	vldr		d1, [r7, #8] // B
+	vmla.f32	q13, q3, d4[1]
+//	vldr		d4, [r8, #0] // A1
+	vmla.f32	q14, q3, d5[0]
+	vmla.f32	q15, q3, d5[1]
+//	vldr		d5, [r8, #8] // A1
+
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		r4, #0
+	ble		2f // return
+
+
+3: // clean1-up loop
+
+	// unroll 0
+	vld1.64		{d4, d5}, [r7:128]! // B
+	vld1.64		{d0, d1}, [r5:128]! // A0
+	vmla.f32	q4, q0, d4[0]
+	sub		r4, r4, #1
+	vmla.f32	q5, q0, d4[1]
+	vmla.f32	q6, q0, d5[0]
+	vmla.f32	q7, q0, d5[1]
+	vld1.64		{d0, d1}, [r8:128]! // A1
+	vmla.f32	q8, q0, d4[0]
+	vmla.f32	q9, q0, d4[1]
+	vmla.f32	q10, q0, d5[0]
+	vmla.f32	q11, q0, d5[1]
+	vld1.64		{d0, d1}, [r9:128]! // A2
+	vmla.f32	q12, q0, d4[0]
+	vmla.f32	q13, q0, d4[1]
+	vmla.f32	q14, q0, d5[0]
+	vmla.f32	q15, q0, d5[1]
+
+	cmp		r4, #0
+	bgt		3b
+
+2: // return
+
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nt_12x4_lib4, .-inner_kernel_gemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- alpha
+// r5   <- beta
+// r6   <- C
+// r7   <- sdc
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_SCALE_AB_12X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_12x4_lib4, %function
+inner_scale_ab_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_12x4_lib4:
+#endif
+#endif
+
+	flds		s8, [r4, #0] // alpha
+
+	vmul.f32	q4, q4, d4[0]
+	flds		s9, [r5, #0] // beta
+	vmul.f32	q5, q5, d4[0]
+	flds		s10, .LC00 // 0.0
+	vmul.f32	q6, q6, d4[0]
+	vmul.f32	q7, q7, d4[0]
+	fcmpes		s9, s10
+	vmul.f32	q8, q8, d4[0]
+	vmul.f32	q9, q9, d4[0]
+	vmul.f32	q10, q10, d4[0]
+	vmul.f32	q11, q11, d4[0]
+	vmul.f32	q12, q12, d4[0]
+	vmul.f32	q13, q13, d4[0]
+	vmul.f32	q14, q14, d4[0]
+	vmul.f32	q15, q15, d4[0]
+	fmstat
+
+	beq		0f // end
+
+	add		r8, r6, r7
+	add		r9, r8, r7
+
+	vld1.64		{d0, d1, d2, d3}, [r6:128]!
+	vmla.f32	q4, q0, d4[1]
+	vmla.f32	q5, q1, d4[1]
+	vld1.64		{d0, d1, d2, d3}, [r6:128]!
+	vmla.f32	q6, q0, d4[1]
+	vmla.f32	q7, q1, d4[1]
+	vld1.64		{d0, d1, d2, d3}, [r8:128]!
+	vmla.f32	q8, q0, d4[1]
+	vmla.f32	q9, q1, d4[1]
+	vld1.64		{d0, d1, d2, d3}, [r8:128]!
+	vmla.f32	q10, q0, d4[1]
+	vmla.f32	q11, q1, d4[1]
+	vld1.64		{d0, d1, d2, d3}, [r9:128]!
+	vmla.f32	q12, q0, d4[1]
+	vmla.f32	q13, q1, d4[1]
+	vld1.64		{d0, d1, d2, d3}, [r9:128]!
+	vmla.f32	q14, q0, d4[1]
+	vmla.f32	q15, q1, d4[1]
+
+0:
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_12x4_lib4, .-inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- D
+// r5   <- sdd
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_STORE_12X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_12x4_lib4, %function
+inner_store_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_12x4_lib4:
+#endif
+#endif
+
+	add		r6, r4, r5
+	add		r7, r6, r5
+
+	vst1.64		{d8, d9, d10, d11}, [r4:128]!
+	vst1.64		{d12, d13, d14, d15}, [r4:128]!
+	vst1.64		{d16, d17, d18, d19}, [r6:128]!
+	vst1.64		{d20, d21, d22, d23}, [r6:128]!
+	vst1.64		{d24, d25, d26, d27}, [r7:128]!
+	vst1.64		{d28, d29, d30, d31}, [r7:128]!
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_store_12x4_lib4, .-inner_store_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// zero double word
+	.align 3
+.LC00: // { 0 }
+	.word 0
+	.word 0
+	.word 0
+	.word 0
+
+//                                r0        r1            r2        r3       sp+0      sp+4         sp+8      sp+12    sp+16     sp+20
+// void kernel_sgemm_nt_12x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
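+//
+// operation: D = alpha*A*B^T + beta*C on a 12x4 single-precision block in
+// panel-major (lib4) format; sda, sdc and sdd are the panel strides of A, C
+// and D (consecutive 4-row panels of a matrix with panel stride sdx are
+// 4*sdx elements apart)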
+
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.global	kernel_sgemm_nt_12x4_lib4
+	.type	kernel_sgemm_nt_12x4_lib4, %function
+kernel_sgemm_nt_12x4_lib4:
+#elif defined(OS_MAC)
+	.global	kernel_sgemm_nt_12x4_lib4
+_kernel_sgemm_nt_12x4_lib4:
+#endif
+
+	// prologue
+
+	// save GP registers
+	stmdb	sp!, {r4 - r10, fp, lr} // save registers
+	add		fp, sp, #36 // fp to old sp position
+
+	// save FP registers
+	fstmfdd	sp!, {d8-d15}
+
+
+
+	// zero accumulation registers
+	vldr	d8, .LC00
+	vldr	d9, .LC00+8
+	vmov	q5, q4
+	vmov	q6, q4
+	vmov	q7, q4
+	vmov	q8, q4
+	vmov	q9, q4
+	vmov	q10, q4
+	vmov	q11, q4
+	vmov	q12, q4
+	vmov	q13, q4
+	vmov	q14, q4
+	vmov	q15, q4
+
+
+
+	// call inner kernel sgemm nt
+	mov		r4, r0 // kmax
+	mov		r5, r2 // A
+	mov		r6, r3 // sda
+	lsl		r6, r6, #4 // 4*sizeof(float)*sda
+	ldr		r7, [fp, #0] // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl	inner_kernel_gemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+	bl	_inner_kernel_gemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+
+	// call inner blend for generic alpha and beta
+	mov		r4, r1 // alpha
+	ldr		r5, [fp, #4] // beta
+	ldr		r6, [fp, #8] // C
+	ldr		r7, [fp, #12] // sdc
+	lsl		r7, r7, #4 // 4*sizeof(float)*sdc
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+
+	// store n
+	ldr		r4, [fp, #16] // D
+	ldr		r5, [fp, #20] // sdd
+	lsl		r5, r5, #4 // 4*sizeof(float)*sdd
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_store_12x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_store_12x4_lib4
+#endif
+#endif
+
+
+
+	// epilogue
+
+	// load FP registers
+	fldmfdd	sp!, {d8-d15}
+
+	// load GP registers and return
+//	ldmia	sp!, {r4 - r10, fp, lr} // load registers
+//	mov		pc, lr // return
+	ldmia	sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_12x4_lib4, .-kernel_sgemm_nt_12x4_lib4
+#endif
+
+
+
+
diff --git a/kernel/armv7a/kernel_sgemm_4x4_lib4.S b/kernel/armv7a/kernel_sgemm_4x4_lib4.S
new file mode 100644
index 0000000..e8a2e71
--- /dev/null
+++ b/kernel/armv7a/kernel_sgemm_4x4_lib4.S
@@ -0,0 +1,675 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- k
+// r5   <- A
+// r6   <- B
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nt_4x4_lib4, %function
+inner_kernel_gemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+	// early return
+	cmp		r4, #0
+	ble		2f // return
+
+	// prefetch
+	pld		[r5, #0]
+	pld		[r6, #0]
+
+	// preload A
+	vld1.64		{d0, d1}, [r5:128]! // A
+	vld1.64		{d4, d5}, [r6:128]! // B
+
+	cmp		r4, #4
+	ble		0f // consider clean up loop
+
+	// main loop
+1:
+	
+	// prefetch
+	pld		[r5, #64]
+	pld		[r6, #64]
+
+	// unroll 0
+	vmla.f32	q4, q0, d4[0]
+	vld1.64		{d2, d3}, [r5:128]! // A
+	vmla.f32	q5, q0, d4[1]
+	vld1.64		{d6, d7}, [r6:128]! // B
+	vmla.f32	q6, q0, d5[0]
+	vmla.f32	q7, q0, d5[1]
+
+	// unroll 1
+	vmla.f32	q4, q1, d6[0]
+	vld1.64		{d0, d1}, [r5:128]! // A
+	vmla.f32	q5, q1, d6[1]
+	vld1.64		{d4, d5}, [r6:128]! // B
+	vmla.f32	q6, q1, d7[0]
+	vmla.f32	q7, q1, d7[1]
+
+	// unroll 2
+	vmla.f32	q4, q0, d4[0]
+	vld1.64		{d2, d3}, [r5:128]! // A
+	vmla.f32	q5, q0, d4[1]
+	vld1.64		{d6, d7}, [r6:128]! // B
+	vmla.f32	q6, q0, d5[0]
+	vmla.f32	q7, q0, d5[1]
+
+	// unroll 3
+	vmla.f32	q4, q1, d6[0]
+	vld1.64		{d0, d1}, [r5:128]! // A
+	vmla.f32	q5, q1, d6[1]
+	vld1.64		{d4, d5}, [r6:128]! // B
+	vmla.f32	q6, q1, d7[0]
+	vmla.f32	q7, q1, d7[1]
+
+	sub		r4, r4, #4
+
+	cmp		r4, #4
+	bgt		1b
+
+0:
+
+	cmp		r4, #3
+	ble		4f
+
+	// unroll 0
+	vmla.f32	q4, q0, d4[0]
+	vld1.64		{d2, d3}, [r5:128]! // A
+	vmla.f32	q5, q0, d4[1]
+	vld1.64		{d6, d7}, [r6:128]! // B
+	vmla.f32	q6, q0, d5[0]
+	vmla.f32	q7, q0, d5[1]
+
+	// unroll 1
+	vmla.f32	q4, q1, d6[0]
+	vld1.64		{d0, d1}, [r5:128]! // A
+	vmla.f32	q5, q1, d6[1]
+	vld1.64		{d4, d5}, [r6:128]! // B
+	vmla.f32	q6, q1, d7[0]
+	vmla.f32	q7, q1, d7[1]
+
+	// unroll 2
+	vmla.f32	q4, q0, d4[0]
+	vld1.64		{d2, d3}, [r5:128]! // A
+	vmla.f32	q5, q0, d4[1]
+	vld1.64		{d6, d7}, [r6:128]! // B
+	vmla.f32	q6, q0, d5[0]
+	vmla.f32	q7, q0, d5[1]
+
+	// unroll 3
+	vmla.f32	q4, q1, d6[0]
+//	vld1.64		{d0, d1}, [r5:128]! // A
+	vmla.f32	q5, q1, d6[1]
+//	vld1.64		{d4, d5}, [r6:128]! // B
+	vmla.f32	q6, q1, d7[0]
+	vmla.f32	q7, q1, d7[1]
+
+	sub		r4, r4, #4
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		r4, #0
+	ble		2f // return
+
+	sub		r5, r5, #16
+	sub		r6, r6, #16
+
+3: // clean1-up loop
+
+	// unroll 0
+	vld1.64		{d0, d1}, [r5:128]! // A
+	vld1.64		{d4, d5}, [r6:128]! // B
+	vmla.f32	q4, q0, d4[0]
+	vmla.f32	q5, q0, d4[1]
+	vmla.f32	q6, q0, d5[0]
+	vmla.f32	q7, q0, d5[1]
+
+	sub		r4, r4, #1
+	cmp		r4, #0
+	bgt		3b
+
+2: // return
+
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nt_4x4_lib4, .-inner_kernel_gemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- k
+// r5   <- A
+// r6   <- B
+// r7   <- 4*sdb*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nn_4x4_lib4, %function
+inner_kernel_gemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+	// early return
+	cmp		r4, #0
+	ble		2f // return
+
+	// prefetch
+	pld		[r5, #0]
+	pld		[r6, #0]
+
+	// preload A
+	vld1.64		{d0, d1}, [r5:128]! // A
+	vldr		d4, [r6, #0]   // B[0,1]
+	vldr		d5, [r6, #16]  // B[4,5]
+	vldr		d6, [r6, #32]  // B[8,9]
+	vldr		d7, [r6, #48]  // B[12,13]
+
+	cmp		r4, #4
+	ble		0f // consider clean up loop
+
+	// main loop
+1:
+	
+	// prefetch
+	pld		[r5, #64]
+	pld		[r6, r7]
+
+	// unroll 0
+	vmla.f32	q4, q0, d4[0]
+	vld1.64		{d2, d3}, [r5:128]! // A
+	vmla.f32	q5, q0, d5[0]
+	vmla.f32	q6, q0, d6[0]
+	vmla.f32	q7, q0, d7[0]
+
+	// unroll 1
+	vld1.64		{d0, d1}, [r5:128]! // A
+	vmla.f32	q4, q1, d4[1]
+	vldr		d4, [r6, #8]  // B[2,3]
+	vmla.f32	q5, q1, d5[1]
+	vldr		d5, [r6, #24] // B[6,7]
+	vmla.f32	q6, q1, d6[1]
+	vldr		d6, [r6, #40] // B[10,11]
+	vmla.f32	q7, q1, d7[1]
+	vldr		d7, [r6, #56] // B[14,15]
+
+	// unroll 2
+	vmla.f32	q4, q0, d4[0]
+	vld1.64		{d2, d3}, [r5:128]! // A
+	vmla.f32	q5, q0, d5[0]
+	add		r6, r6, r7
+	vmla.f32	q6, q0, d6[0]
+	vmla.f32	q7, q0, d7[0]
+
+	// unroll 3
+	vld1.64		{d0, d1}, [r5:128]! // A
+	vmla.f32	q4, q1, d4[1]
+	vldr		d4, [r6, #0]   // B[0,1]
+	vmla.f32	q5, q1, d5[1]
+	vldr		d5, [r6, #16]  // B[4,5]
+	vmla.f32	q6, q1, d6[1]
+	vldr		d6, [r6, #32]  // B[8,9]
+	vmla.f32	q7, q1, d7[1]
+	vldr		d7, [r6, #48]  // B[12,13]
+
+	sub		r4, r4, #4
+
+	cmp		r4, #4
+	bgt		1b
+
+0:
+
+	cmp		r4, #3
+	ble		4f
+
+	// unroll 0
+	vmla.f32	q4, q0, d4[0]
+	vld1.64		{d2, d3}, [r5:128]! // A
+	vmla.f32	q5, q0, d5[0]
+	vmla.f32	q6, q0, d6[0]
+	vmla.f32	q7, q0, d7[0]
+
+	// unroll 1
+	vld1.64		{d0, d1}, [r5:128]! // A
+	vmla.f32	q4, q1, d4[1]
+	vldr		d4, [r6, #8]  // B[2,3]
+	vmla.f32	q5, q1, d5[1]
+	vldr		d5, [r6, #24] // B[6,7]
+	vmla.f32	q6, q1, d6[1]
+	vldr		d6, [r6, #40] // B[10,11]
+	vmla.f32	q7, q1, d7[1]
+	vldr		d7, [r6, #56] // B[14,15]
+
+	// unroll 2
+	vmla.f32	q4, q0, d4[0]
+	vld1.64		{d2, d3}, [r5:128]! // A
+	vmla.f32	q5, q0, d5[0]
+	add		r6, r6, r7
+	vmla.f32	q6, q0, d6[0]
+	vmla.f32	q7, q0, d7[0]
+
+	// unroll 3
+//	vld1.64		{d0, d1}, [r5:128]! // A
+	vmla.f32	q4, q1, d4[1]
+//	vldr		d4, [r6, #0]   // B[0,1]
+	vmla.f32	q5, q1, d5[1]
+//	vldr		d5, [r6, #16]  // B[4,5]
+	vmla.f32	q6, q1, d6[1]
+//	vldr		d6, [r6, #32]  // B[8,9]
+	vmla.f32	q7, q1, d7[1]
+//	vldr		d7, [r6, #48]  // B[12,13]
+
+	sub		r4, r4, #4
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		r4, #0
+	ble		2f // return
+
+	sub		r5, r5, #16
+
+3: // clean1-up loop
+
+	// unroll 0
+	vld1.64		{d0, d1}, [r5:128]! // A
+	vldr		s8, [r6, #0]  // B[0]
+	vmla.f32	q4, q0, d4[0]
+	vldr		s8, [r6, #16] // B[4]
+	vmla.f32	q5, q0, d4[0]
+	vldr		s8, [r6, #32] // B[8]
+	vmla.f32	q6, q0, d4[0]
+	vldr		s8, [r6, #48] // B[12]
+	vmla.f32	q7, q0, d4[0]
+
+	sub		r4, r4, #1
+	add		r6, r6, #4
+	cmp		r4, #0
+	bgt		3b
+
+2: // return
+
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nn_4x4_lib4, .-inner_kernel_gemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- alpha
+// r5   <- beta
+// r6   <- C
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_SCALE_AB_4X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_4x4_lib4, %function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#endif
+#endif
+
+	flds		s8, [r4, #0] // alpha
+
+	vmul.f32	q4, q4, d4[0]
+	flds		s9, [r5, #0] // beta
+	vmul.f32	q5, q5, d4[0]
+	flds		s10, .LC00 // 0.0
+	vmul.f32	q6, q6, d4[0]
+	fcmpes		s9, s10
+	vmul.f32	q7, q7, d4[0]
+	fmstat
+
+	beq		0f // end
+
+	vld1.64		{d0, d1, d2, d3}, [r6:128]!
+	vmla.f32	q4, q0, d4[1]
+	vmla.f32	q5, q1, d4[1]
+	vld1.64		{d0, d1, d2, d3}, [r6:128]!
+	vmla.f32	q6, q0, d4[1]
+	vmla.f32	q7, q1, d4[1]
+
+0:
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- D
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_STORE_4X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_lib4, %function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#endif
+#endif
+
+	vst1.64		{d8, d9, d10, d11}, [r4:128]!
+	vst1.64		{d12, d13, d14, d15}, [r4:128]!
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// zero double word
+	.align 3
+.LC00: // { 0 }
+	.word 0
+	.word 0
+	.word 0
+	.word 0
+
+//                               r0        r1            r2        r3        sp+0         sp+4      sp+8
+// void kernel_sgemm_nt_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
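+//
+// operation: D = alpha*A*B^T + beta*C on a 4x4 single-precision block in
+// panel-major (lib4) format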
+
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.global	kernel_sgemm_nt_4x4_lib4
+	.type	kernel_sgemm_nt_4x4_lib4, %function
+kernel_sgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+	.global	kernel_sgemm_nt_4x4_lib4
+_kernel_sgemm_nt_4x4_lib4:
+#endif
+
+	// prologue
+
+	// save GP registers
+	stmdb	sp!, {r4 - r10, fp, lr} // save registers
+	add		fp, sp, #36 // fp to old sp position
+
+	// save FP registers
+	fstmfdd	sp!, {d8-d15}
+
+
+
+	// zero accumulation registers
+	vldr	d8, .LC00
+	vldr	d9, .LC00+8
+	vmov	q5, q4
+	vmov	q6, q4
+	vmov	q7, q4
+
+
+
+	// call inner kernel sgemm nt
+	mov		r4, r0 // kmax
+	mov		r5, r2 // A
+	mov		r6, r3 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl	inner_kernel_gemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	bl	_inner_kernel_gemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+	// call inner blend for generic alpha and beta
+	mov		r4, r1 // alpha
+	ldr		r5, [fp, #0] // beta
+	ldr		r6, [fp, #4] // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+	// store n
+	ldr		r4, [fp, #8] // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+	// epilogue
+
+	// load FP registers
+	fldmfdd	sp!, {d8-d15}
+
+	// load GP registers and return
+//	ldmia	sp!, {r4 - r10, fp, lr} // load registers
+//	mov		pc, lr // return
+	ldmia	sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_4x4_lib4, .-kernel_sgemm_nt_4x4_lib4
+#endif
+
+
+
+//                               r0        r1            r2        r3        sp+0     sp+4         sp+8      sp+12
+// void kernel_sgemm_nn_4x4_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D)
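+//
+// operation: D = alpha*A*B + beta*C on a 4x4 single-precision block; B is
+// panel-major with panel stride sdb (4*sizeof(float)*sdb bytes between
+// consecutive 4-row panels)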
+
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.global	kernel_sgemm_nn_4x4_lib4
+	.type	kernel_sgemm_nn_4x4_lib4, %function
+kernel_sgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+	.global	kernel_sgemm_nn_4x4_lib4
+_kernel_sgemm_nn_4x4_lib4:
+#endif
+
+	// prologue
+
+	// save GP registers
+	stmdb	sp!, {r4 - r10, fp, lr} // save registers
+	add		fp, sp, #36 // fp to old sp position
+
+	// save FP registers
+	fstmfdd	sp!, {d8-d15}
+
+
+
+	// zero accumulation registers
+	vldr	d8, .LC00
+	vldr	d9, .LC00+8
+	vmov	q5, q4
+	vmov	q6, q4
+	vmov	q7, q4
+
+
+
+	// call inner kernel sgemm nn
+	mov		r4, r0 // kmax
+	mov		r5, r2 // A
+	mov		r6, r3 // B
+	ldr		r7, [fp, #0] // sdb
+	lsl		r7, r7, #4 // 4*sizeof(float)*sdb
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl	inner_kernel_gemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	bl	_inner_kernel_gemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+	// call inner blend for generic alpha and beta
+	mov		r4, r1 // alpha
+	ldr		r5, [fp, #4] // beta
+	ldr		r6, [fp, #8] // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+	// store n
+	ldr		r4, [fp, #12] // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+	// epilogue
+
+	// load FP registers
+	fldmfdd	sp!, {d8-d15}
+
+	// load GP registers and return
+//	ldmia	sp!, {r4 - r10, fp, lr} // load registers
+//	mov		pc, lr // return
+	ldmia	sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_4x4_lib4, .-kernel_sgemm_nn_4x4_lib4
+#endif
+
+
+
+
+
+
diff --git a/kernel/armv7a/kernel_sgemm_8x4_lib4.S b/kernel/armv7a/kernel_sgemm_8x4_lib4.S
new file mode 100644
index 0000000..f356c9b
--- /dev/null
+++ b/kernel/armv7a/kernel_sgemm_8x4_lib4.S
@@ -0,0 +1,795 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- k
+// r5   <- A
+// r6   <- sda
+// r7   <- B
+//
+// output arguments:
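+//
+// note: 8x4 accumulation; A is split into two 4-row panels A0 and A1 located
+// r6 bytes apart, with the result kept in q4-q7 (rows 0-3) and q8-q11
+// (rows 4-7)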
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nt_8x4_lib4, %function
+inner_kernel_gemm_add_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x4_lib4:
+#endif
+#endif
+
+	// early return
+	cmp		r4, #0
+	ble		2f // return
+
+	add		r8, r5, r6 // A1
+
+	// prefetch
+	pld		[r5, #0]
+	pld		[r7, #0]
+	pld		[r8, #0]
+	pld		[r7, #64]
+
+	// preload
+	vld1.64		{d0, d1}, [r7:128]! // B // TODO: consider preloading B into d0-d3 as well
+	vld1.64		{d2, d3}, [r7:128]! // B
+	vld1.64		{d4, d5}, [r7:128]! // B
+	vld1.64		{d6, d7}, [r7:128]! // B
+	vld1.64		{d24, d25}, [r5:128]! // A0
+	vld1.64		{d28, d29}, [r5:128]! // A0
+	vld1.64		{d26, d27}, [r8:128] // A1
+
+	sub		r7, r7, #64
+	sub		r5, r5, #32
+
+	cmp		r4, #4
+	ble		0f // consider clean up loop
+
+	// main loop
+1:
+	
+	// unroll 0
+	pld		[r5, #64] // A0
+	vmla.f32	q4, q12, d0[0]
+	vldr		d30, [r8, #16] // A1
+	vmla.f32	q5, q12, d0[1]
+	vldr		d31, [r8, #24] // A1
+	vmla.f32	q6, q12, d1[0]
+	pld		[r7, #128] // B
+	vmla.f32	q7, q12, d1[1]
+	vldr		d24, [r5, #32]
+	vmla.f32	q8, q13, d0[0]
+	vldr		d25, [r5, #40]
+	vmla.f32	q9, q13, d0[1]
+	vldr		d0, [r7, #64]
+	vmla.f32	q10, q13, d1[0]
+	pld		[r8, #64] // A1
+	vmla.f32	q11, q13, d1[1]
+	vldr		d1, [r7, #72]
+
+	// unroll 1
+	vmla.f32	q4, q14, d2[0]
+	vldr		d26, [r8, #32] // A1
+	vmla.f32	q5, q14, d2[1]
+	vldr		d27, [r8, #40] // A1
+	vmla.f32	q6, q14, d3[0]
+	vmla.f32	q7, q14, d3[1]
+	vldr		d28, [r5, #48]
+	vmla.f32	q8, q15, d2[0]
+	vldr		d29, [r5, #56]
+	vmla.f32	q9, q15, d2[1]
+	vldr		d2, [r7, #80]
+	vmla.f32	q10, q15, d3[0]
+	add		r5, r5, #64
+	vmla.f32	q11, q15, d3[1]
+	vldr		d3, [r7, #88]
+
+	// unroll 2
+	vmla.f32	q4, q12, d4[0]
+	vldr		d30, [r8, #48] // A1
+	vmla.f32	q5, q12, d4[1]
+	vldr		d31, [r8, #56] // A1
+	vmla.f32	q6, q12, d5[0]
+	add		r7, r7, #64
+	vmla.f32	q7, q12, d5[1]
+	vldr		d24, [r5, #0]
+	vmla.f32	q8, q13, d4[0]
+	vldr		d25, [r5, #8]
+	vmla.f32	q9, q13, d4[1]
+	vldr		d4, [r7, #32]
+	vmla.f32	q10, q13, d5[0]
+	add		r8, r8, #64
+	vmla.f32	q11, q13, d5[1]
+	vldr		d5, [r7, #40]
+
+	// unroll 3
+	vmla.f32	q4, q14, d6[0]
+	vldr		d26, [r8, #0] // A1
+	vmla.f32	q5, q14, d6[1]
+	vldr		d27, [r8, #8] // A1
+	vmla.f32	q6, q14, d7[0]
+	sub		r4, r4, #4
+	vmla.f32	q7, q14, d7[1]
+	vldr		d28, [r5, #16]
+	vmla.f32	q8, q15, d6[0]
+	vldr		d29, [r5, #24]
+	vmla.f32	q9, q15, d6[1]
+	vldr		d6, [r7, #48]
+	vmla.f32	q10, q15, d7[0]
+	vmla.f32	q11, q15, d7[1]
+	vldr		d7, [r7, #56]
+
+	cmp		r4, #4
+	bgt		1b
+
+0:
+
+	cmp		r4, #3
+	ble		4f
+
+
+	// unroll 0
+	vmla.f32	q4, q12, d0[0]
+	vldr		d30, [r8, #16] // A1
+	vmla.f32	q5, q12, d0[1]
+	vldr		d31, [r8, #24] // A1
+	vmla.f32	q6, q12, d1[0]
+	vmla.f32	q7, q12, d1[1]
+	vldr		d24, [r5, #32]
+	vmla.f32	q8, q13, d0[0]
+	vldr		d25, [r5, #40]
+	vmla.f32	q9, q13, d0[1]
+//	vldr		d4, [r7, #64]
+	vmla.f32	q10, q13, d1[0]
+	vmla.f32	q11, q13, d1[1]
+//	vldr		d5, [r7, #72]
+
+	// unroll 1
+	vmla.f32	q4, q14, d2[0]
+	vldr		d26, [r8, #32] // A1
+	vmla.f32	q5, q14, d2[1]
+	vldr		d27, [r8, #40] // A1
+	vmla.f32	q6, q14, d3[0]
+	vmla.f32	q7, q14, d3[1]
+	vldr		d28, [r5, #48]
+	vmla.f32	q8, q15, d2[0]
+	vldr		d29, [r5, #56]
+	vmla.f32	q9, q15, d2[1]
+//	vldr		d6, [r7, #80]
+	vmla.f32	q10, q15, d3[0]
+//	add		r5, r5, #64
+	vmla.f32	q11, q15, d3[1]
+//	vldr		d7, [r7, #88]
+
+	// unroll 2
+	vmla.f32	q4, q12, d4[0]
+	vldr		d30, [r8, #48] // A1
+	vmla.f32	q5, q12, d4[1]
+	vldr		d31, [r8, #56] // A1
+	vmla.f32	q6, q12, d5[0]
+//	add		r7, r7, #64
+	vmla.f32	q7, q12, d5[1]
+//	vldr		d24, [r5, #0]
+	vmla.f32	q8, q13, d4[0]
+//	vldr		d25, [r5, #8]
+	vmla.f32	q9, q13, d4[1]
+//	vldr		d4, [r7, #32]
+	vmla.f32	q10, q13, d5[0]
+//	add		r8, r8, #64
+	vmla.f32	q11, q13, d5[1]
+//	vldr		d5, [r7, #40]
+
+	// unroll 3
+	vmla.f32	q4, q14, d6[0]
+//	vldr		d26, [r8, #0] // A1
+	vmla.f32	q5, q14, d6[1]
+//	vldr		d27, [r8, #8] // A1
+	vmla.f32	q6, q14, d7[0]
+	sub		r4, r4, #4
+	vmla.f32	q7, q14, d7[1]
+//	vldr		d28, [r5, #16]
+	vmla.f32	q8, q15, d6[0]
+//	vldr		d29, [r5, #24]
+	vmla.f32	q9, q15, d6[1]
+//	vldr		d6, [r7, #48]
+	vmla.f32	q10, q15, d7[0]
+	vmla.f32	q11, q15, d7[1]
+//	vldr		d7, [r7, #56]
+
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		r4, #0
+	ble		2f // return
+
+//	sub		r5, r5, #32 // A0
+//	sub		r7, r7, #32 // B
+//	sub		r8, r8, #16 // A1
+
+3: // clean1-up loop
+
+	// unroll 0
+	vld1.64		{d4, d5}, [r7:128]! // B
+	vld1.64		{d0, d1}, [r5:128]! // A0
+	vmla.f32	q4, q0, d4[0]
+	vmla.f32	q5, q0, d4[1]
+	vmla.f32	q6, q0, d5[0]
+	vmla.f32	q7, q0, d5[1]
+	vld1.64		{d0, d1}, [r8:128]! // A1
+	vmla.f32	q8, q0, d4[0]
+	vmla.f32	q9, q0, d4[1]
+	vmla.f32	q10, q0, d5[0]
+	vmla.f32	q11, q0, d5[1]
+
+	sub		r4, r4, #1
+	cmp		r4, #0
+	bgt		3b
+
+2: // return
+
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nt_8x4_lib4, .-inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+#if 0
+// subroutine
+//
+// input arguments:
+// r4   <- k
+// r5   <- A
+// r6   <- B
+// r7   <- 4*sdb*sizeof(float)
+//
+// output arguments:
+
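+//
+// note on the NN addressing (a reading of the code, offsets as used below): B is stored in
+// 4-row panels, so for the current panel the k-th row of the four B columns sits at byte
+// offsets 4*k + {0, 16, 32, 48} from r6; once four k-iterations have consumed the panel,
+// r6 is advanced by r7 = 4*sdb*sizeof(float) to the next panel (the "add r6, r6, r7").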
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nn_4x4_lib4, %function
+inner_kernel_gemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+	// early return
+	cmp		r4, #0
+	ble		2f // return
+
+	// prefetch
+	pld		[r5, #0]
+	pld		[r6, #0]
+
+	// preload A
+	vld1.64		{d0, d1}, [r5:128]! // A
+	vldr		d4, [r6, #0]   // B[0,1]
+	vldr		d5, [r6, #16]  // B[4,5]
+	vldr		d6, [r6, #32]  // B[8,9]
+	vldr		d7, [r6, #48]  // B[12,13]
+
+	cmp		r4, #4
+	ble		0f // consider clean up loop
+
+	// main loop
+1:
+	
+	// prefetch
+	pld		[r5, #64]
+	pld		[r6, r7]
+
+	// unroll 0
+	vmla.f32	q4, q0, d4[0]
+	vld1.64		{d2, d3}, [r5:128]! // A
+	vmla.f32	q5, q0, d5[0]
+	vmla.f32	q6, q0, d6[0]
+	vmla.f32	q7, q0, d7[0]
+
+	// unroll 1
+	vld1.64		{d0, d1}, [r5:128]! // A
+	vmla.f32	q4, q1, d4[1]
+	vldr		d4, [r6, #8]  // B[2,3]
+	vmla.f32	q5, q1, d5[1]
+	vldr		d5, [r6, #24] // B[6,7]
+	vmla.f32	q6, q1, d6[1]
+	vldr		d6, [r6, #40] // B[10,11]
+	vmla.f32	q7, q1, d7[1]
+	vldr		d7, [r6, #56] // B[14,15]
+
+	// unroll 2
+	vmla.f32	q4, q0, d4[0]
+	vld1.64		{d2, d3}, [r5:128]! // A
+	vmla.f32	q5, q0, d5[0]
+	add		r6, r6, r7
+	vmla.f32	q6, q0, d6[0]
+	vmla.f32	q7, q0, d7[0]
+
+	// unroll 3
+	vld1.64		{d0, d1}, [r5:128]! // A
+	vmla.f32	q4, q1, d4[1]
+	vldr		d4, [r6, #0]   // B[0,1]
+	vmla.f32	q5, q1, d5[1]
+	vldr		d5, [r6, #16]  // B[4,5]
+	vmla.f32	q6, q1, d6[1]
+	vldr		d6, [r6, #32]  // B[8,9]
+	vmla.f32	q7, q1, d7[1]
+	vldr		d7, [r6, #48]  // B[12,13]
+
+	sub		r4, r4, #4
+
+	cmp		r4, #4
+	bgt		1b
+
+0:
+
+	cmp		r4, #3
+	ble		4f
+
+	// unroll 0
+	vmla.f32	q4, q0, d4[0]
+	vld1.64		{d2, d3}, [r5:128]! // A
+	vmla.f32	q5, q0, d5[0]
+	vmla.f32	q6, q0, d6[0]
+	vmla.f32	q7, q0, d7[0]
+
+	// unroll 1
+	vld1.64		{d0, d1}, [r5:128]! // A
+	vmla.f32	q4, q1, d4[1]
+	vldr		d4, [r6, #8]  // B[2,3]
+	vmla.f32	q5, q1, d5[1]
+	vldr		d5, [r6, #24] // B[6,7]
+	vmla.f32	q6, q1, d6[1]
+	vldr		d6, [r6, #40] // B[10,11]
+	vmla.f32	q7, q1, d7[1]
+	vldr		d7, [r6, #56] // B[14,15]
+
+	// unroll 2
+	vmla.f32	q4, q0, d4[0]
+	vld1.64		{d2, d3}, [r5:128]! // A
+	vmla.f32	q5, q0, d5[0]
+	add		r6, r6, r7
+	vmla.f32	q6, q0, d6[0]
+	vmla.f32	q7, q0, d7[0]
+
+	// unroll 3
+//	vld1.64		{d0, d1}, [r5:128]! // A
+	vmla.f32	q4, q1, d4[1]
+//	vldr		d4, [r6, #0]   // B[0,1]
+	vmla.f32	q5, q1, d5[1]
+//	vldr		d5, [r6, #16]  // B[4,5]
+	vmla.f32	q6, q1, d6[1]
+//	vldr		d6, [r6, #32]  // B[8,9]
+	vmla.f32	q7, q1, d7[1]
+//	vldr		d7, [r6, #48]  // B[12,13]
+
+	sub		r4, r4, #4
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		r4, #0
+	ble		2f // return
+
+	sub		r5, r5, #16
+
+3: // clean1-up loop
+
+	// unroll 0
+	vld1.64		{d0, d1}, [r5:128]! // A
+	vldr		s8, [r6, #0]  // B[0]
+	vmla.f32	q4, q0, d4[0]
+	vldr		s8, [r6, #16] // B[4]
+	vmla.f32	q5, q0, d4[0]
+	vldr		s8, [r6, #32] // B[8]
+	vmla.f32	q6, q0, d4[0]
+	vldr		s8, [r6, #48] // B[12]
+	vmla.f32	q7, q0, d4[0]
+
+	sub		r4, r4, #1
+	add		r6, r6, #4
+	cmp		r4, #0
+	bgt		3b
+
+2: // return
+
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nn_4x4_lib4, .-inner_kernel_gemm_add_nn_4x4_lib4
+#endif
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- alpha
+// r5   <- beta
+// r6   <- C
+// r7   <- 4*sdc*sizeof(float)
+//
+// output arguments:
+
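+//
+// note: beta is compared against the zero constant at .LC00 (fcmpes/fmstat below); when
+// beta == 0.0 the branch skips the whole C load/accumulate block, so C is never read in
+// that case.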
+#if MACRO_LEVEL>=2
+	.macro INNER_SCALE_AB_8X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x4_lib4, %function
+inner_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib4:
+#endif
+#endif
+
+	flds		s8, [r4, #0] // alpha
+
+	vmul.f32	q4, q4, d4[0]
+	flds		s9, [r5, #0] // beta
+	vmul.f32	q5, q5, d4[0]
+	flds		s10, .LC00 // 0.0
+	vmul.f32	q6, q6, d4[0]
+	vmul.f32	q7, q7, d4[0]
+	fcmpes		s9, s10
+	vmul.f32	q8, q8, d4[0]
+	vmul.f32	q9, q9, d4[0]
+	vmul.f32	q10, q10, d4[0]
+	vmul.f32	q11, q11, d4[0]
+	fmstat
+
+	beq		0f // end
+
+	add		r8, r6, r7
+
+	vld1.64		{d0, d1, d2, d3}, [r6:128]!
+	vmla.f32	q4, q0, d4[1]
+	vmla.f32	q5, q1, d4[1]
+	vld1.64		{d0, d1, d2, d3}, [r6:128]!
+	vmla.f32	q6, q0, d4[1]
+	vmla.f32	q7, q1, d4[1]
+	vld1.64		{d0, d1, d2, d3}, [r8:128]!
+	vmla.f32	q8, q0, d4[1]
+	vmla.f32	q9, q1, d4[1]
+	vld1.64		{d0, d1, d2, d3}, [r8:128]!
+	vmla.f32	q10, q0, d4[1]
+	vmla.f32	q11, q1, d4[1]
+
+0:
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4   <- D
+// r5   <- 4*sdd*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_STORE_8X4_LIB4
+#else
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_lib4, %function
+inner_store_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib4:
+#endif
+#endif
+
+	add		r6, r4, r5
+
+	vst1.64		{d8, d9, d10, d11}, [r4:128]!
+	vst1.64		{d12, d13, d14, d15}, [r4:128]!
+	vst1.64		{d16, d17, d18, d19}, [r6:128]!
+	vst1.64		{d20, d21, d22, d23}, [r6:128]!
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	mov		pc, lr // return
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// 16-byte block of zeros (used as the 0.0 constant and to zero the accumulation registers)
+	.align 3
+.LC00: // { 0 }
+	.word 0
+	.word 0
+	.word 0
+	.word 0
+
+//                               r0        r1            r2        r3       sp+0      sp+4         sp+8      sp+12    sp+16     sp+20
+// void kernel_sgemm_nt_8x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
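+//
+// note: after the prologue below, fp points at the caller's sp, so the stack arguments are
+// read as [fp, #0] = B, [fp, #4] = beta, [fp, #8] = C, [fp, #12] = sdc, [fp, #16] = D,
+// [fp, #20] = sdd.
+//
+// rough C equivalent of what this kernel computes (illustrative sketch only, not part of the
+// build; A0/A1 stand for the two 4-row panels of A and acc0/acc1 for the accumulator, all
+// hypothetical names):
+//
+//	for(k=0; k<kmax; k++)
+//		for(j=0; j<4; j++)
+//			for(i=0; i<4; i++)
+//				{
+//				acc0[i+4*j] += A0[i+4*k] * B[j+4*k]; // rows 0..3
+//				acc1[i+4*j] += A1[i+4*k] * B[j+4*k]; // rows 4..7
+//				}
+//	// then D = alpha*acc + beta*C, stored back in the same 4-row-panel (lib4) layout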
+
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.global	kernel_sgemm_nt_8x4_lib4
+	.type	kernel_sgemm_nt_8x4_lib4, %function
+kernel_sgemm_nt_8x4_lib4:
+#elif defined(OS_MAC)
+	.global	kernel_sgemm_nt_8x4_lib4
+_kernel_sgemm_nt_8x4_lib4:
+#endif
+
+	// prologue
+
+	// save GP registers
+	stmdb	sp!, {r4 - r10, fp, lr} // save registers
+	add		fp, sp, #36 // fp to old sp position
+
+	// save FP registers
+	fstmfdd	sp!, {d8-d15}
+
+
+
+	// zero accumulation registers
+	vldr	d8, .LC00
+	vldr	d9, .LC00+8
+	vmov	q5, q4
+	vmov	q6, q4
+	vmov	q7, q4
+	vmov	q8, q4
+	vmov	q9, q4
+	vmov	q10, q4
+	vmov	q11, q4
+
+
+
+	// call inner kernel sgemm nt
+	mov		r4, r0 // kmax
+	mov		r5, r2 // A
+	mov		r6, r3 // sda
+	lsl		r6, r6, #4 // 4*sizeof(float)*sda
+	ldr		r7, [fp, #0] // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl	inner_kernel_gemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	bl	_inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+
+	// call inner blend for generic alpha and beta
+	mov		r4, r1 // alpha
+	ldr		r5, [fp, #4] // beta
+	ldr		r6, [fp, #8] // C
+	ldr		r7, [fp, #12] // sdc
+	lsl		r7, r7, #4 // 4*sizeof(float)*sdc
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+	// store n
+	ldr		r4, [fp, #16] // D
+	ldr		r5, [fp, #20] // sdd
+	lsl		r5, r5, #4 // 4*sizeof(float)*sdd
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_store_8x4_lib4
+#endif
+#endif
+
+
+
+	// epilogue
+
+	// load FP registers
+	fldmfdd	sp!, {d8-d15}
+
+	// load GP registers and return
+//	ldmia	sp!, {r4 - r10, fp, lr} // load registers
+//	mov		pc, lr // return
+	ldmia	sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x4_lib4, .-kernel_sgemm_nt_8x4_lib4
+#endif
+
+
+
+#if 0
+//                               r0        r1            r2        r3        sp+0     sp+4         sp+8      sp+12
+// void kernel_sgemm_nn_4x4_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D)
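+//
+// note: this wrapper is compiled out by the enclosing #if 0, as is the matching NN inner
+// kernel earlier in this file (under its own #if 0).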
+
+//	.p2align 4,,15
+#if defined(OS_LINUX)
+	.global	kernel_sgemm_nn_4x4_lib4
+	.type	kernel_sgemm_nn_4x4_lib4, %function
+kernel_sgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+	.global	kernel_sgemm_nn_4x4_lib4
+_kernel_sgemm_nn_4x4_lib4:
+#endif
+
+	// prologue
+
+	// save GP registers
+	stmdb	sp!, {r4 - r10, fp, lr} // save registers
+	add		fp, sp, #36 // fp to old sp position
+
+	// save FP registers
+	fstmfdd	sp!, {d8-d15}
+
+
+
+	// zero accumulation registers
+	vldr	d8, .LC00
+	vldr	d9, .LC00+8
+	vmov	q5, q4
+	vmov	q6, q4
+	vmov	q7, q4
+
+
+
+	// call inner kernel sgemm nn
+	mov		r4, r0 // kmax
+	mov		r5, r2 // A
+	mov		r6, r3 // B
+	ldr		r7, [fp, #0] // sdb
+	lsl		r7, r7, #4 // 4*sizeof(float)*sdb
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl	inner_kernel_gemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	bl	_inner_kernel_gemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+	// call inner blend for generic alpha and beta
+	mov		r4, r1 // alpha
+	ldr		r5, [fp, #4] // beta
+	ldr		r6, [fp, #8] // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+	// store n
+	ldr		r4, [fp, #12] // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+	bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+	// epilogue
+
+	// load FP registers
+	fldmfdd	sp!, {d8-d15}
+
+	// load GP registers and return
+//	ldmia	sp!, {r4 - r10, fp, lr} // load registers
+//	mov		pc, lr // return
+	ldmia	sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_4x4_lib4, .-kernel_sgemm_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+
diff --git a/kernel/armv8a/Makefile b/kernel/armv8a/Makefile
new file mode 100644
index 0000000..75e1faf
--- /dev/null
+++ b/kernel/armv8a/Makefile
@@ -0,0 +1,49 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS = 
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += kernel_dgemm_8x4_lib4.o kernel_dgemm_4x4_lib4.o
+OBJS += kernel_sgemm_16x4_lib4.o kernel_sgemm_12x4_lib4.o kernel_sgemm_8x8_lib4.o kernel_sgemm_8x4_lib4.o kernel_sgemm_4x4_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+	rm -f *.o
+	rm -f *.s
+
diff --git a/kernel/armv8a/kernel_dgemm_4x4_lib4.S b/kernel/armv8a/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..2d43b10
--- /dev/null
+++ b/kernel/armv8a/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,414 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+	add sp, sp, #-(11 * 16); \
+	stp d8, d9, [sp, #(0 * 16)]; \
+	stp d10, d11, [sp, #(1 * 16)]; \
+	stp d12, d13, [sp, #(2 * 16)]; \
+	stp d14, d15, [sp, #(3 * 16)]; \
+	stp x18, x19, [sp, #(4 * 16)]; \
+	stp x20, x21, [sp, #(5 * 16)]; \
+	stp x22, x23, [sp, #(6 * 16)]; \
+	stp x24, x25, [sp, #(7 * 16)]; \
+	stp x26, x27, [sp, #(8 * 16)]; \
+	stp x28, x29, [sp, #(9 * 16)]; \
+	str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+	ldp d8, d9, [sp, #(0 * 16)]; \
+	ldp d10, d11, [sp, #(1 * 16)]; \
+	ldp d12, d13, [sp, #(2 * 16)]; \
+	ldp d14, d15, [sp, #(3 * 16)]; \
+	ldp x18, x19, [sp, #(4 * 16)]; \
+	ldp x20, x21, [sp, #(5 * 16)]; \
+	ldp x22, x23, [sp, #(6 * 16)]; \
+	ldp x24, x25, [sp, #(7 * 16)]; \
+	ldp x26, x27, [sp, #(8 * 16)]; \
+	ldp x28, x29, [sp, #(9 * 16)]; \
+	ldr x30, [sp, #(10 * 16)]; \
+	add sp, sp, #(11 * 16);
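+
+// note: the PROLOGUE/EPILOGUE pair saves and restores the AAPCS64 callee-saved registers
+// (the low halves of v8-v15 via d8-d15, and x19-x30), plus the platform register x18 for
+// good measure, so the kernels below can freely use v8-v15 and the saved GP registers as
+// scratch.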
+
+
+
+
+
+	.text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8   <- k
+// x9   <- A
+// x10  <- B
+//
+// output arguments:
+
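+//
+// note on the register layout (a reading of the code): v24/v25 hold the current 4x1 column
+// of A (two doubles each) and v28/v29 the current 4x1 column of B; the 4x4 accumulator lives
+// in v0-v7, two rows per register, column by column (v0/v1 = column 0, ..., v6/v7 = column 3).
+// The main loop is unrolled 4 times with the loads for the next step interleaved, which is
+// why x9 and x10 are rewound by 32 bytes on loop exit (the over-preloaded data is reloaded
+// by the tail code).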
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+	.align	4
+	.type inner_kernel_dgemm_add_nt_4x4_lib4, %function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+
+// TODO more aggressive preload of A !!!
+
+	// early return
+	cmp		w8, #0
+	ble		2f // return
+
+	// prefetch
+	prfm	PLDL1KEEP, [x9, #0]
+	prfm	PLDL1KEEP, [x10, #0]
+
+	cmp		w8, #4
+	ble		0f // consider clean up loop
+
+	// preload
+	ld1   {v24.2d, v25.2d}, [x9], #32
+	ld1   {v28.2d, v29.2d}, [x10], #32
+
+	// prefetch
+	prfm	PLDL1KEEP, [x9, #32]
+	prfm	PLDL1KEEP, [x10, #32]
+
+	// main loop
+1:
+	
+	// unroll 0
+	fmla	v0.2d, v24.2d, v28.2d[0]
+	ld1		{v26.2d, v27.2d}, [x9], #32
+	fmla	v1.2d, v25.2d, v28.2d[0]
+	ld1		{v30.2d, v31.2d}, [x10], #32
+	fmla	v2.2d, v24.2d, v28.2d[1]
+	prfm	PLDL1KEEP, [x9, #64]
+	fmla	v3.2d, v25.2d, v28.2d[1]
+	prfm	PLDL1KEEP, [x10, #64]
+	fmla	v4.2d, v24.2d, v29.2d[0]
+	fmla	v5.2d, v25.2d, v29.2d[0]
+	fmla	v6.2d, v24.2d, v29.2d[1]
+	fmla	v7.2d, v25.2d, v29.2d[1]
+	sub		w8, w8, #4
+
+	// unroll 1
+	fmla	v0.2d, v26.2d, v30.2d[0]
+	ld1		{v24.2d, v25.2d}, [x9], #32
+	fmla	v1.2d, v27.2d, v30.2d[0]
+	ld1		{v28.2d, v29.2d}, [x10], #32
+	fmla	v2.2d, v26.2d, v30.2d[1]
+	fmla	v3.2d, v27.2d, v30.2d[1]
+	fmla	v4.2d, v26.2d, v31.2d[0]
+	fmla	v5.2d, v27.2d, v31.2d[0]
+	fmla	v6.2d, v26.2d, v31.2d[1]
+	fmla	v7.2d, v27.2d, v31.2d[1]
+
+	// unroll 2
+	fmla	v0.2d, v24.2d, v28.2d[0]
+	ld1		{v26.2d, v27.2d}, [x9], #32
+	fmla	v1.2d, v25.2d, v28.2d[0]
+	ld1		{v30.2d, v31.2d}, [x10], #32
+	fmla	v2.2d, v24.2d, v28.2d[1]
+	prfm	PLDL1KEEP, [x9, #64]
+	fmla	v3.2d, v25.2d, v28.2d[1]
+	prfm	PLDL1KEEP, [x10, #64]
+	fmla	v4.2d, v24.2d, v29.2d[0]
+	fmla	v5.2d, v25.2d, v29.2d[0]
+	fmla	v6.2d, v24.2d, v29.2d[1]
+	fmla	v7.2d, v25.2d, v29.2d[1]
+
+	// unroll 3
+	fmla	v0.2d, v26.2d, v30.2d[0]
+	ld1		{v24.2d, v25.2d}, [x9], #32
+	fmla	v1.2d, v27.2d, v30.2d[0]
+	ld1		{v28.2d, v29.2d}, [x10], #32
+	fmla	v2.2d, v26.2d, v30.2d[1]
+	fmla	v3.2d, v27.2d, v30.2d[1]
+	fmla	v4.2d, v26.2d, v31.2d[0]
+	fmla	v5.2d, v27.2d, v31.2d[0]
+	fmla	v6.2d, v26.2d, v31.2d[1]
+	fmla	v7.2d, v27.2d, v31.2d[1]
+
+	cmp		w8, #4
+	bgt		1b
+
+	sub		x9, x9, #32
+	sub		x10, x10, #32
+
+0:
+
+	cmp		w8, #3
+	ble		4f
+
+	// unroll 0
+	ld1		{v24.2d, v25.2d}, [x9], #32
+	ld1		{v28.2d, v29.2d}, [x10], #32
+	fmla	v0.2d, v24.2d, v28.2d[0]
+	fmla	v1.2d, v25.2d, v28.2d[0]
+	fmla	v2.2d, v24.2d, v28.2d[1]
+	fmla	v3.2d, v25.2d, v28.2d[1]
+	fmla	v4.2d, v24.2d, v29.2d[0]
+	fmla	v5.2d, v25.2d, v29.2d[0]
+	fmla	v6.2d, v24.2d, v29.2d[1]
+	fmla	v7.2d, v25.2d, v29.2d[1]
+
+	// unroll 1
+	ld1		{v24.2d, v25.2d}, [x9], #32
+	ld1		{v28.2d, v29.2d}, [x10], #32
+	fmla	v0.2d, v24.2d, v28.2d[0]
+	fmla	v1.2d, v25.2d, v28.2d[0]
+	fmla	v2.2d, v24.2d, v28.2d[1]
+	fmla	v3.2d, v25.2d, v28.2d[1]
+	fmla	v4.2d, v24.2d, v29.2d[0]
+	fmla	v5.2d, v25.2d, v29.2d[0]
+	fmla	v6.2d, v24.2d, v29.2d[1]
+	fmla	v7.2d, v25.2d, v29.2d[1]
+
+	// unroll 2
+	ld1		{v24.2d, v25.2d}, [x9], #32
+	ld1		{v28.2d, v29.2d}, [x10], #32
+	fmla	v0.2d, v24.2d, v28.2d[0]
+	fmla	v1.2d, v25.2d, v28.2d[0]
+	fmla	v2.2d, v24.2d, v28.2d[1]
+	fmla	v3.2d, v25.2d, v28.2d[1]
+	fmla	v4.2d, v24.2d, v29.2d[0]
+	fmla	v5.2d, v25.2d, v29.2d[0]
+	fmla	v6.2d, v24.2d, v29.2d[1]
+	fmla	v7.2d, v25.2d, v29.2d[1]
+
+	// unroll 3
+	ld1		{v24.2d, v25.2d}, [x9], #32
+	ld1		{v28.2d, v29.2d}, [x10], #32
+	fmla	v0.2d, v24.2d, v28.2d[0]
+	fmla	v1.2d, v25.2d, v28.2d[0]
+	fmla	v2.2d, v24.2d, v28.2d[1]
+	fmla	v3.2d, v25.2d, v28.2d[1]
+	fmla	v4.2d, v24.2d, v29.2d[0]
+	fmla	v5.2d, v25.2d, v29.2d[0]
+	fmla	v6.2d, v24.2d, v29.2d[1]
+	fmla	v7.2d, v25.2d, v29.2d[1]
+
+	sub		w8, w8, #4
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		w8, #0
+	ble		2f // return
+
+3: // clean1-up loop
+
+	// unroll 0
+	ld1		{v24.2d, v25.2d}, [x9], #32
+	ld1		{v28.2d, v29.2d}, [x10], #32
+	fmla	v0.2d, v24.2d, v28.2d[0]
+	fmla	v1.2d, v25.2d, v28.2d[0]
+	fmla	v2.2d, v24.2d, v28.2d[1]
+	fmla	v3.2d, v25.2d, v28.2d[1]
+	fmla	v4.2d, v24.2d, v29.2d[0]
+	fmla	v5.2d, v25.2d, v29.2d[0]
+	fmla	v6.2d, v24.2d, v29.2d[1]
+	fmla	v7.2d, v25.2d, v29.2d[1]
+
+	sub		w8, w8, #1
+	cmp		w8, #0
+	bgt		3b
+
+2: // return
+
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8   <- alpha
+// x9   <- beta
+// x10  <- C
+//
+// output arguments:
+
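+//
+// note: the accumulator is first scaled by alpha (broadcast from lane 0 of v28), then C is
+// loaded two columns (64 bytes) at a time and accumulated in scaled by beta; there is no
+// beta == 0 short-circuit, so C is always read.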
+#if MACRO_LEVEL>=2
+	.macro INNER_SCALE_AB_4X4_LIB4
+#else
+	.align	4
+	.type inner_scale_ab_4x4_lib4, %function
+inner_scale_ab_4x4_lib4:
+#endif
+
+	ld1		{v28.2d}, [x8]
+
+	fmul	v0.2d, v0.2d, v28.2d[0]
+	fmul	v1.2d, v1.2d, v28.2d[0]
+	fmul	v2.2d, v2.2d, v28.2d[0]
+	fmul	v3.2d, v3.2d, v28.2d[0]
+	fmul	v4.2d, v4.2d, v28.2d[0]
+	fmul	v5.2d, v5.2d, v28.2d[0]
+	fmul	v6.2d, v6.2d, v28.2d[0]
+	fmul	v7.2d, v7.2d, v28.2d[0]
+
+	ld1		{v28.2d}, [x9]
+
+	ld1		{v24.2d, v25.2d, v26.2d, v27.2d}, [x10], #64
+	fmla	v0.2d, v24.2d, v28.2d[0]
+	fmla	v1.2d, v25.2d, v28.2d[0]
+	fmla	v2.2d, v26.2d, v28.2d[0]
+	fmla	v3.2d, v27.2d, v28.2d[0]
+
+	ld1		{v24.2d, v25.2d, v26.2d, v27.2d}, [x10], #64
+	fmla	v4.2d, v24.2d, v28.2d[0]
+	fmla	v5.2d, v25.2d, v28.2d[0]
+	fmla	v6.2d, v26.2d, v28.2d[0]
+	fmla	v7.2d, v27.2d, v28.2d[0]
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8   <- D
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_STORE_4X4_LIB4
+#else
+	.align 4
+	.type inner_store_4x4_lib4, %function
+inner_store_4x4_lib4:
+#endif
+
+	st1		{v0.2d, v1.2d, v2.2d, v3.2d}, [x8], #64
+	st1		{v4.2d, v5.2d, v6.2d, v7.2d}, [x8], #64
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+
+
+
+
+
+//                               w0        x1             x2         x3         x4            x5         x6
+// void kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
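+//
+// hedged usage sketch (illustrative values only; assumes A, B, C, D are packed in the
+// 4-row-panel lib4 format):
+//
+//	// double alpha = 1.0, beta = 0.0;
+//	// kernel_dgemm_nt_4x4_lib4(kmax, &alpha, A, B, &beta, C, D);
+//	// computes one 4x4 block D = alpha * A * B^T + beta * C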
+
+	.align	4
+	.global	kernel_dgemm_nt_4x4_lib4
+	.type	kernel_dgemm_nt_4x4_lib4, %function
+kernel_dgemm_nt_4x4_lib4:
+	
+
+
+	PROLOGUE
+
+
+
+	// zero the accumulation registers (on AArch64 a write to dX also clears the upper 64 bits of vX, so the whole q register is zeroed)
+	fmov	d0, xzr
+	fmov    d1, d0
+	fmov    d2, d0
+	fmov    d3, d0
+	fmov    d4, d0
+	fmov    d5, d0
+	fmov    d6, d0
+	fmov    d7, d0
+
+
+
+	// call inner kernel dgemm nt
+	mov		w8, w0 // kmax
+	mov		x9, x2 // A
+	mov		x10, x3 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+	bl	inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+
+
+
+	// call inner blend for generic alpha and beta
+	mov		x8, x1 // alpha
+	mov		x9, x4 // beta
+	mov		x10, x5 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+	bl inner_scale_ab_4x4_lib4
+#endif
+
+
+
+	// store n
+	mov		x8, x6
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+	bl inner_store_4x4_lib4
+#endif
+
+
+
+	EPILOGUE
+
+	mov	x0, #0
+
+	ret
+
diff --git a/kernel/armv8a/kernel_dgemm_8x4_lib4.S b/kernel/armv8a/kernel_dgemm_8x4_lib4.S
new file mode 100644
index 0000000..314489d
--- /dev/null
+++ b/kernel/armv8a/kernel_dgemm_8x4_lib4.S
@@ -0,0 +1,575 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+	sub sp, sp, #(11 * 16); \
+	stp d8, d9, [sp, #(0 * 16)]; \
+	stp d10, d11, [sp, #(1 * 16)]; \
+	stp d12, d13, [sp, #(2 * 16)]; \
+	stp d14, d15, [sp, #(3 * 16)]; \
+	stp x18, x19, [sp, #(4 * 16)]; \
+	stp x20, x21, [sp, #(5 * 16)]; \
+	stp x22, x23, [sp, #(6 * 16)]; \
+	stp x24, x25, [sp, #(7 * 16)]; \
+	stp x26, x27, [sp, #(8 * 16)]; \
+	stp x28, x29, [sp, #(9 * 16)]; \
+	str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+	ldp d8, d9, [sp, #(0 * 16)]; \
+	ldp d10, d11, [sp, #(1 * 16)]; \
+	ldp d12, d13, [sp, #(2 * 16)]; \
+	ldp d14, d15, [sp, #(3 * 16)]; \
+	ldp x18, x19, [sp, #(4 * 16)]; \
+	ldp x20, x21, [sp, #(5 * 16)]; \
+	ldp x22, x23, [sp, #(6 * 16)]; \
+	ldp x24, x25, [sp, #(7 * 16)]; \
+	ldp x26, x27, [sp, #(8 * 16)]; \
+	ldp x28, x29, [sp, #(9 * 16)]; \
+	ldr x30, [sp, #(10 * 16)]; \
+	add sp, sp, #(11 * 16);
+
+
+
+
+
+	.text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8   <- k
+// x9   <- A
+// x10  <- 4*sda*sizeof(double)
+// x11  <- B
+//
+// output arguments:
+
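+//
+// note on the register layout (a reading of the code): x9 walks the first 4-row panel of A
+// and x12 = x9 + x10 the second; B values are loaded as scalar doubles (d24-d31) and
+// broadcast through lane [0] of the fmla. The accumulators are v0-v7 for rows 0-3 and
+// v8-v15 for rows 4-7, two rows per register, column by column.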
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+	.align	4
+	.type inner_kernel_gemm_add_nt_8x4_lib4, %function
+inner_kernel_gemm_add_nt_8x4_lib4:
+#endif
+
+// TODO more aggressive preload of A !!!
+
+	// early return
+	cmp		w8, #0
+	ble		2f // return
+
+	add		x12, x9, x10
+
+	// prefetch
+	prfm	PLDL1KEEP, [x11, #0]
+	prfm	PLDL1KEEP, [x9, #0]
+	prfm	PLDL1KEEP, [x12, #0]
+
+	// preload
+	ldp		d24, d25, [x11], #16
+	ldp		d26, d27, [x11], #16
+	ldp		q16, q17, [x9], #32
+	ldp		q20, q21, [x12], #32
+
+	cmp		w8, #4
+	ble		0f // consider clean up loop
+
+	// prefetch
+	prfm	PLDL1KEEP, [x11, #32]
+	prfm	PLDL1KEEP, [x9, #32]
+	prfm	PLDL1KEEP, [x12, #32]
+
+	// main loop
+1:
+	
+	// unroll 0
+	ldp		d28, d29, [x11], #16
+	fmla	v0.2d, v16.2d, v24.2d[0]
+	fmla	v1.2d, v17.2d, v24.2d[0]
+	ldp		d30, d31, [x11], #16
+	fmla	v2.2d, v16.2d, v25.2d[0]
+	fmla	v3.2d, v17.2d, v25.2d[0]
+	ldr		q18, [x9], #16
+	fmla	v4.2d, v16.2d, v26.2d[0]
+	fmla	v5.2d, v17.2d, v26.2d[0]
+	ldr		q19, [x9], #16
+	fmla	v6.2d, v16.2d, v27.2d[0]
+	fmla	v7.2d, v17.2d, v27.2d[0]
+	prfm	PLDL1KEEP, [x11, #64]
+	fmla	v8.2d, v20.2d, v24.2d[0]
+	fmla	v9.2d, v21.2d, v24.2d[0]
+	prfm	PLDL1KEEP, [x9, #64]
+	fmla	v10.2d, v20.2d, v25.2d[0]
+	fmla	v11.2d, v21.2d, v25.2d[0]
+	ldp		q22, q23, [x12], #32
+	fmla	v12.2d, v20.2d, v26.2d[0]
+	fmla	v13.2d, v21.2d, v26.2d[0]
+	prfm	PLDL1KEEP, [x12, #64]
+	fmla	v14.2d, v20.2d, v27.2d[0]
+	fmla	v15.2d, v21.2d, v27.2d[0]
+
+	// unroll 1
+	ldp		d24, d25, [x11], #16
+	fmla	v0.2d, v18.2d, v28.2d[0]
+	fmla	v1.2d, v19.2d, v28.2d[0]
+	ldp		d26, d27, [x11], #16
+	fmla	v2.2d, v18.2d, v29.2d[0]
+	fmla	v3.2d, v19.2d, v29.2d[0]
+	ldr		q16, [x9], #16
+	fmla	v4.2d, v18.2d, v30.2d[0]
+	fmla	v5.2d, v19.2d, v30.2d[0]
+	ldr		q17, [x9], #16
+	fmla	v6.2d, v18.2d, v31.2d[0]
+	fmla	v7.2d, v19.2d, v31.2d[0]
+	ldr		q20, [x12], #16
+	fmla	v8.2d, v22.2d, v28.2d[0]
+	fmla	v9.2d, v23.2d, v28.2d[0]
+	ldr		q21, [x12], #16
+	fmla	v10.2d, v22.2d, v29.2d[0]
+	fmla	v11.2d, v23.2d, v29.2d[0]
+	sub		w8, w8, #4
+	fmla	v12.2d, v22.2d, v30.2d[0]
+	fmla	v13.2d, v23.2d, v30.2d[0]
+	fmla	v14.2d, v22.2d, v31.2d[0]
+	fmla	v15.2d, v23.2d, v31.2d[0]
+
+	// unroll 2
+	ldp		d28, d29, [x11], #16
+	fmla	v0.2d, v16.2d, v24.2d[0]
+	fmla	v1.2d, v17.2d, v24.2d[0]
+	ldp		d30, d31, [x11], #16
+	fmla	v2.2d, v16.2d, v25.2d[0]
+	fmla	v3.2d, v17.2d, v25.2d[0]
+	ldr		q18, [x9], #16
+	fmla	v4.2d, v16.2d, v26.2d[0]
+	fmla	v5.2d, v17.2d, v26.2d[0]
+	ldr		q19, [x9], #16
+	fmla	v6.2d, v16.2d, v27.2d[0]
+	fmla	v7.2d, v17.2d, v27.2d[0]
+	prfm	PLDL1KEEP, [x11, #64]
+	fmla	v8.2d, v20.2d, v24.2d[0]
+	fmla	v9.2d, v21.2d, v24.2d[0]
+	prfm	PLDL1KEEP, [x9, #64]
+	fmla	v10.2d, v20.2d, v25.2d[0]
+	fmla	v11.2d, v21.2d, v25.2d[0]
+	ldp		q22, q23, [x12], #32
+	fmla	v12.2d, v20.2d, v26.2d[0]
+	fmla	v13.2d, v21.2d, v26.2d[0]
+	prfm	PLDL1KEEP, [x12, #64]
+	fmla	v14.2d, v20.2d, v27.2d[0]
+	fmla	v15.2d, v21.2d, v27.2d[0]
+
+	// unroll 3
+	ldp		d24, d25, [x11], #16
+	fmla	v0.2d, v18.2d, v28.2d[0]
+	fmla	v1.2d, v19.2d, v28.2d[0]
+	ldp		d26, d27, [x11], #16
+	fmla	v2.2d, v18.2d, v29.2d[0]
+	fmla	v3.2d, v19.2d, v29.2d[0]
+	ldr		q16, [x9], #16
+	fmla	v4.2d, v18.2d, v30.2d[0]
+	fmla	v5.2d, v19.2d, v30.2d[0]
+	ldr		q17, [x9], #16
+	fmla	v6.2d, v18.2d, v31.2d[0]
+	fmla	v7.2d, v19.2d, v31.2d[0]
+	ldr		q20, [x12], #16
+	fmla	v8.2d, v22.2d, v28.2d[0]
+	fmla	v9.2d, v23.2d, v28.2d[0]
+	ldr		q21, [x12], #16
+	fmla	v10.2d, v22.2d, v29.2d[0]
+	fmla	v11.2d, v23.2d, v29.2d[0]
+	cmp		w8, #4
+	fmla	v12.2d, v22.2d, v30.2d[0]
+	fmla	v13.2d, v23.2d, v30.2d[0]
+	fmla	v14.2d, v22.2d, v31.2d[0]
+	fmla	v15.2d, v23.2d, v31.2d[0]
+
+	bgt		1b
+
+0:
+
+	cmp		w8, #3
+	ble		4f
+
+	
+	// unroll 0
+	ldp		d28, d29, [x11], #16
+	fmla	v0.2d, v16.2d, v24.2d[0]
+	fmla	v1.2d, v17.2d, v24.2d[0]
+	ldp		d30, d31, [x11], #16
+	fmla	v2.2d, v16.2d, v25.2d[0]
+	fmla	v3.2d, v17.2d, v25.2d[0]
+	ldr		q18, [x9], #16
+	fmla	v4.2d, v16.2d, v26.2d[0]
+	fmla	v5.2d, v17.2d, v26.2d[0]
+	ldr		q19, [x9], #16
+	fmla	v6.2d, v16.2d, v27.2d[0]
+	fmla	v7.2d, v17.2d, v27.2d[0]
+	prfm	PLDL1KEEP, [x11, #64]
+	fmla	v8.2d, v20.2d, v24.2d[0]
+	fmla	v9.2d, v21.2d, v24.2d[0]
+	prfm	PLDL1KEEP, [x9, #64]
+	fmla	v10.2d, v20.2d, v25.2d[0]
+	fmla	v11.2d, v21.2d, v25.2d[0]
+	ldp		q22, q23, [x12], #32
+	fmla	v12.2d, v20.2d, v26.2d[0]
+	fmla	v13.2d, v21.2d, v26.2d[0]
+	prfm	PLDL1KEEP, [x12, #64]
+	fmla	v14.2d, v20.2d, v27.2d[0]
+	fmla	v15.2d, v21.2d, v27.2d[0]
+
+	// unroll 1
+	ldp		d24, d25, [x11], #16
+	fmla	v0.2d, v18.2d, v28.2d[0]
+	fmla	v1.2d, v19.2d, v28.2d[0]
+	ldp		d26, d27, [x11], #16
+	fmla	v2.2d, v18.2d, v29.2d[0]
+	fmla	v3.2d, v19.2d, v29.2d[0]
+	ldr		q16, [x9], #16
+	fmla	v4.2d, v18.2d, v30.2d[0]
+	fmla	v5.2d, v19.2d, v30.2d[0]
+	ldr		q17, [x9], #16
+	fmla	v6.2d, v18.2d, v31.2d[0]
+	fmla	v7.2d, v19.2d, v31.2d[0]
+	ldr		q20, [x12], #16
+	fmla	v8.2d, v22.2d, v28.2d[0]
+	fmla	v9.2d, v23.2d, v28.2d[0]
+	ldr		q21, [x12], #16
+	fmla	v10.2d, v22.2d, v29.2d[0]
+	fmla	v11.2d, v23.2d, v29.2d[0]
+	sub		w8, w8, #4
+	fmla	v12.2d, v22.2d, v30.2d[0]
+	fmla	v13.2d, v23.2d, v30.2d[0]
+	fmla	v14.2d, v22.2d, v31.2d[0]
+	fmla	v15.2d, v23.2d, v31.2d[0]
+
+	// unroll 2
+	ldp		d28, d29, [x11], #16
+	fmla	v0.2d, v16.2d, v24.2d[0]
+	fmla	v1.2d, v17.2d, v24.2d[0]
+	ldp		d30, d31, [x11], #16
+	fmla	v2.2d, v16.2d, v25.2d[0]
+	fmla	v3.2d, v17.2d, v25.2d[0]
+	ldr		q18, [x9], #16
+	fmla	v4.2d, v16.2d, v26.2d[0]
+	fmla	v5.2d, v17.2d, v26.2d[0]
+	ldr		q19, [x9], #16
+	fmla	v6.2d, v16.2d, v27.2d[0]
+	fmla	v7.2d, v17.2d, v27.2d[0]
+//	prfm	PLDL1KEEP, [x11, #64]
+	fmla	v8.2d, v20.2d, v24.2d[0]
+	fmla	v9.2d, v21.2d, v24.2d[0]
+//	prfm	PLDL1KEEP, [x9, #64]
+	fmla	v10.2d, v20.2d, v25.2d[0]
+	fmla	v11.2d, v21.2d, v25.2d[0]
+	ldp		q22, q23, [x12], #32
+	fmla	v12.2d, v20.2d, v26.2d[0]
+	fmla	v13.2d, v21.2d, v26.2d[0]
+//	prfm	PLDL1KEEP, [x12, #64]
+	fmla	v14.2d, v20.2d, v27.2d[0]
+	fmla	v15.2d, v21.2d, v27.2d[0]
+
+	// unroll 3
+//	ldp		d24, d25, [x11], #16
+	fmla	v0.2d, v18.2d, v28.2d[0]
+	fmla	v1.2d, v19.2d, v28.2d[0]
+//	ldp		d26, d27, [x11], #16
+	fmla	v2.2d, v18.2d, v29.2d[0]
+	fmla	v3.2d, v19.2d, v29.2d[0]
+//	ldr		q16, [x9], #16
+	fmla	v4.2d, v18.2d, v30.2d[0]
+	fmla	v5.2d, v19.2d, v30.2d[0]
+//	ldr		q17, [x9], #16
+	fmla	v6.2d, v18.2d, v31.2d[0]
+	fmla	v7.2d, v19.2d, v31.2d[0]
+//	ldr		q20, [x12], #16
+	fmla	v8.2d, v22.2d, v28.2d[0]
+	fmla	v9.2d, v23.2d, v28.2d[0]
+//	ldr		q21, [x12], #16
+	fmla	v10.2d, v22.2d, v29.2d[0]
+	fmla	v11.2d, v23.2d, v29.2d[0]
+//	cmp		w8, #4
+	fmla	v12.2d, v22.2d, v30.2d[0]
+	fmla	v13.2d, v23.2d, v30.2d[0]
+	fmla	v14.2d, v22.2d, v31.2d[0]
+	fmla	v15.2d, v23.2d, v31.2d[0]
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		w8, #0
+	ble		2f // return
+
+	sub		x9, x9, #32
+	sub		x11, x11, #32
+	sub		x12, x12, #32
+
+3: // clean1-up loop
+
+	// unroll 0
+	ld1		{v20.2d, v21.2d}, [x9], #32
+	ld1		{v28.2d, v29.2d}, [x11], #32
+	fmla	v0.2d, v20.2d, v28.2d[0]
+	fmla	v1.2d, v21.2d, v28.2d[0]
+	fmla	v2.2d, v20.2d, v28.2d[1]
+	fmla	v3.2d, v21.2d, v28.2d[1]
+	fmla	v4.2d, v20.2d, v29.2d[0]
+	fmla	v5.2d, v21.2d, v29.2d[0]
+	fmla	v6.2d, v20.2d, v29.2d[1]
+	fmla	v7.2d, v21.2d, v29.2d[1]
+	ld1		{v22.2d, v23.2d}, [x12], #32
+	fmla	v8.2d, v22.2d, v28.2d[0]
+	fmla	v9.2d, v23.2d, v28.2d[0]
+	fmla	v10.2d, v22.2d, v28.2d[1]
+	fmla	v11.2d, v23.2d, v28.2d[1]
+	fmla	v12.2d, v22.2d, v29.2d[0]
+	fmla	v13.2d, v23.2d, v29.2d[0]
+	fmla	v14.2d, v22.2d, v29.2d[1]
+	fmla	v15.2d, v23.2d, v29.2d[1]
+
+	sub		w8, w8, #1
+	cmp		w8, #0
+	bgt		3b
+
+2: // return
+
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_kernel_gemm_add_nt_8x4_lib4, .-inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8   <- alpha
+// x9   <- beta
+// x10  <- C
+// x11  <- 4*sdc*sizeof(double)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_SCALE_AB_8X4_LIB4
+#else
+	.align	4
+	.type inner_scale_ab_8x4_lib4, %function
+inner_scale_ab_8x4_lib4:
+#endif
+
+	ld1		{v28.2d}, [x8]
+
+	fmul	v0.2d, v0.2d, v28.2d[0]
+	fmul	v1.2d, v1.2d, v28.2d[0]
+	fmul	v2.2d, v2.2d, v28.2d[0]
+	fmul	v3.2d, v3.2d, v28.2d[0]
+	fmul	v4.2d, v4.2d, v28.2d[0]
+	fmul	v5.2d, v5.2d, v28.2d[0]
+	fmul	v6.2d, v6.2d, v28.2d[0]
+	fmul	v7.2d, v7.2d, v28.2d[0]
+	fmul	v8.2d, v8.2d, v28.2d[0]
+	fmul	v9.2d, v9.2d, v28.2d[0]
+	fmul	v10.2d, v10.2d, v28.2d[0]
+	fmul	v11.2d, v11.2d, v28.2d[0]
+	fmul	v12.2d, v12.2d, v28.2d[0]
+	fmul	v13.2d, v13.2d, v28.2d[0]
+	fmul	v14.2d, v14.2d, v28.2d[0]
+	fmul	v15.2d, v15.2d, v28.2d[0]
+
+	ld1		{v28.2d}, [x9]
+
+	add		x12, x10, x11
+
+	ld1		{v24.2d, v25.2d, v26.2d, v27.2d}, [x10], #64
+	fmla	v0.2d, v24.2d, v28.2d[0]
+	fmla	v1.2d, v25.2d, v28.2d[0]
+	fmla	v2.2d, v26.2d, v28.2d[0]
+	fmla	v3.2d, v27.2d, v28.2d[0]
+
+	ld1		{v24.2d, v25.2d, v26.2d, v27.2d}, [x10], #64
+	fmla	v4.2d, v24.2d, v28.2d[0]
+	fmla	v5.2d, v25.2d, v28.2d[0]
+	fmla	v6.2d, v26.2d, v28.2d[0]
+	fmla	v7.2d, v27.2d, v28.2d[0]
+
+	ld1		{v24.2d, v25.2d, v26.2d, v27.2d}, [x12], #64
+	fmla	v8.2d, v24.2d, v28.2d[0]
+	fmla	v9.2d, v25.2d, v28.2d[0]
+	fmla	v10.2d, v26.2d, v28.2d[0]
+	fmla	v11.2d, v27.2d, v28.2d[0]
+
+	ld1		{v24.2d, v25.2d, v26.2d, v27.2d}, [x12], #64
+	fmla	v12.2d, v24.2d, v28.2d[0]
+	fmla	v13.2d, v25.2d, v28.2d[0]
+	fmla	v14.2d, v26.2d, v28.2d[0]
+	fmla	v15.2d, v27.2d, v28.2d[0]
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8   <- D
+// x9   <- 4*sdd*sizeof(double)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_STORE_8X4_LIB4
+#else
+	.align 4
+	.type inner_store_8x4_lib4, %function
+inner_store_8x4_lib4:
+#endif
+
+	add		x10, x8, x9
+
+	st1		{v0.2d, v1.2d, v2.2d, v3.2d}, [x8], #64
+	st1		{v4.2d, v5.2d, v6.2d, v7.2d}, [x8], #64
+	st1		{v8.2d, v9.2d, v10.2d, v11.2d}, [x10], #64
+	st1		{v12.2d, v13.2d, v14.2d, v15.2d}, [x10], #64
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+
+
+
+
+
+//                               w0        x1             x2         w3       x4         x5            x6         w7       sp+0       sp+8
+// void kernel_dgemm_nt_8x4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
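+//
+// note: the 9th and 10th arguments (D and sdd) are passed on the caller's stack; since the
+// PROLOGUE moves sp down by STACKSIZE, they are read below as [sp, #(STACKSIZE + 0)] and
+// [sp, #(STACKSIZE + 8)]. The sda/sdc/sdd strides are converted to byte strides with
+// lsl #5, i.e. multiplied by 4*sizeof(double).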
+
+	.align	4
+	.global	kernel_dgemm_nt_8x4_lib4
+	.type	kernel_dgemm_nt_8x4_lib4, %function
+kernel_dgemm_nt_8x4_lib4:
+	
+
+
+	PROLOGUE
+
+
+
+	// zero the accumulation registers (on AArch64 a write to dX also clears the upper 64 bits of vX, so the whole q register is zeroed)
+	fmov	d0, xzr
+	fmov    d1, d0
+	fmov    d2, d0
+	fmov    d3, d0
+	fmov    d4, d0
+	fmov    d5, d0
+	fmov    d6, d0
+	fmov    d7, d0
+	fmov    d8, d0
+	fmov    d9, d0
+	fmov    d10, d0
+	fmov    d11, d0
+	fmov    d12, d0
+	fmov    d13, d0
+	fmov    d14, d0
+	fmov    d15, d0
+
+
+
+	// call inner kernel gemm nt
+	mov		w8, w0 // kmax
+	mov		x9, x2 // A
+	mov		w10, w3 // sda
+	lsl		w10, w10, #5 // 32*sda
+	mov		x11, x4 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+	bl	inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+
+
+
+	// call inner blend for generic alpha and beta
+	mov		x8, x1 // alpha
+	mov		x9, x5 // beta
+	mov		x10, x6 // C
+	mov		w11, w7 // sdc
+	lsl		w11, w11, #5 // 32*sdc
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB4
+#else
+	bl inner_scale_ab_8x4_lib4
+#endif
+
+
+
+	// store n
+	ldr		x8, [sp, #(STACKSIZE + 0)] // D
+	ldr		w9, [sp, #(STACKSIZE + 8)] // sdd
+	lsl		w9, w9, #5 // 32*sdd
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+	bl inner_store_8x4_lib4
+#endif
+
+
+
+	EPILOGUE
+
+	mov	x0, #0
+
+	ret
+
+
diff --git a/kernel/armv8a/kernel_sgemm_12x4_lib4.S b/kernel/armv8a/kernel_sgemm_12x4_lib4.S
new file mode 100644
index 0000000..ab66cad
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_12x4_lib4.S
@@ -0,0 +1,512 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+	sub sp, sp, #(11 * 16); \
+	stp d8, d9, [sp, #(0 * 16)]; \
+	stp d10, d11, [sp, #(1 * 16)]; \
+	stp d12, d13, [sp, #(2 * 16)]; \
+	stp d14, d15, [sp, #(3 * 16)]; \
+	stp x18, x19, [sp, #(4 * 16)]; \
+	stp x20, x21, [sp, #(5 * 16)]; \
+	stp x22, x23, [sp, #(6 * 16)]; \
+	stp x24, x25, [sp, #(7 * 16)]; \
+	stp x26, x27, [sp, #(8 * 16)]; \
+	stp x28, x29, [sp, #(9 * 16)]; \
+	str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+	ldp d8, d9, [sp, #(0 * 16)]; \
+	ldp d10, d11, [sp, #(1 * 16)]; \
+	ldp d12, d13, [sp, #(2 * 16)]; \
+	ldp d14, d15, [sp, #(3 * 16)]; \
+	ldp x18, x19, [sp, #(4 * 16)]; \
+	ldp x20, x21, [sp, #(5 * 16)]; \
+	ldp x22, x23, [sp, #(6 * 16)]; \
+	ldp x24, x25, [sp, #(7 * 16)]; \
+	ldp x26, x27, [sp, #(8 * 16)]; \
+	ldp x28, x29, [sp, #(9 * 16)]; \
+	ldr x30, [sp, #(10 * 16)]; \
+	add sp, sp, #(11 * 16);
+
+
+
+
+
+	.text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8   <- k
+// x9   <- A
+// x10  <- 4*sda*sizeof(float)
+// x11  <- B
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_12X4_LIB4
+#else
+	.align	4
+	.type inner_kernel_gemm_add_nt_12x4_lib4, %function
+inner_kernel_gemm_add_nt_12x4_lib4:
+#endif
+
+	// early return
+	cmp		w8, #0
+	ble		2f // return
+
+	add		x12, x9, x10
+	add		x13, x12, x10
+
+	// prefetch
+	prfm	PLDL1KEEP, [x11, #0]
+	prfm	PLDL1KEEP, [x9, #0]
+	prfm	PLDL1KEEP, [x12, #0]
+	prfm	PLDL1KEEP, [x13, #0]
+
+	// preload
+	ld1		{v24.4s, v25.4s}, [x9], #32
+	ld1		{v28.4s, v29.4s}, [x11], #32
+	ld1		{v20.4s, v21.4s}, [x12], #32
+	ld1		{v16.4s, v17.4s}, [x13], #32
+
+	cmp		w8, #4
+	ble		0f // consider clean up loop
+
+	// prefetch
+	prfm	PLDL1KEEP, [x11, #32]
+	prfm	PLDL1KEEP, [x9, #32]
+	prfm	PLDL1KEEP, [x12, #32]
+	prfm	PLDL1KEEP, [x13, #32]
+
+	// main loop
+1:
+
+	// unroll 0
+	ld1		{v26.4s}, [x9], #16
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	fmla	v1.4s, v24.4s, v28.4s[1]
+	ld1		{v27.4s}, [x9], #16
+	fmla	v2.4s, v24.4s, v28.4s[2]
+	fmla	v3.4s, v24.4s, v28.4s[3]
+	ld1		{v30.4s}, [x11], #16
+	fmla	v4.4s, v20.4s, v28.4s[0]
+	fmla	v5.4s, v20.4s, v28.4s[1]
+	ld1		{v31.4s}, [x11], #16
+	fmla	v6.4s, v20.4s, v28.4s[2]
+	fmla	v7.4s, v20.4s, v28.4s[3]
+	ld1		{v22.4s}, [x12], #16
+	fmla	v8.4s, v16.4s, v28.4s[0]
+	fmla	v9.4s, v16.4s, v28.4s[1]
+	ld1		{v23.4s}, [x12], #16
+	fmla	v10.4s, v16.4s, v28.4s[2]
+	fmla	v11.4s, v16.4s, v28.4s[3]
+
+	// unroll 1
+	ld1		{v18.4s}, [x13], #16
+	fmla	v0.4s, v25.4s, v29.4s[0]
+	fmla	v1.4s, v25.4s, v29.4s[1]
+	ld1		{v19.4s}, [x13], #16
+	fmla	v2.4s, v25.4s, v29.4s[2]
+	fmla	v3.4s, v25.4s, v29.4s[3]
+	prfm	PLDL1KEEP, [x11, #64]
+	fmla	v4.4s, v21.4s, v29.4s[0]
+	fmla	v5.4s, v21.4s, v29.4s[1]
+	prfm	PLDL1KEEP, [x9, #64]
+	fmla	v6.4s, v21.4s, v29.4s[2]
+	fmla	v7.4s, v21.4s, v29.4s[3]
+	prfm	PLDL1KEEP, [x12, #64]
+	fmla	v8.4s, v17.4s, v29.4s[0]
+	fmla	v9.4s, v17.4s, v29.4s[1]
+	sub		w8, w8, #4
+	fmla	v10.4s, v17.4s, v29.4s[2]
+	fmla	v11.4s, v17.4s, v29.4s[3]
+
+	// unroll 2
+	ld1		{v24.4s}, [x9], #16
+	fmla	v0.4s, v26.4s, v30.4s[0]
+	fmla	v1.4s, v26.4s, v30.4s[1]
+	ld1		{v25.4s}, [x9], #16
+	fmla	v2.4s, v26.4s, v30.4s[2]
+	fmla	v3.4s, v26.4s, v30.4s[3]
+	ld1		{v28.4s}, [x11], #16
+	fmla	v4.4s, v22.4s, v30.4s[0]
+	fmla	v5.4s, v22.4s, v30.4s[1]
+	ld1		{v29.4s}, [x11], #16
+	fmla	v6.4s, v22.4s, v30.4s[2]
+	fmla	v7.4s, v22.4s, v30.4s[3]
+	ld1		{v20.4s}, [x12], #16
+	fmla	v8.4s, v18.4s, v30.4s[0]
+	fmla	v9.4s, v18.4s, v30.4s[1]
+	ld1		{v21.4s}, [x12], #16
+	fmla	v10.4s, v18.4s, v30.4s[2]
+	fmla	v11.4s, v18.4s, v30.4s[3]
+
+	// unroll 3
+	ld1		{v16.4s}, [x13], #16
+	fmla	v0.4s, v27.4s, v31.4s[0]
+	fmla	v1.4s, v27.4s, v31.4s[1]
+	ld1		{v17.4s}, [x13], #16
+	fmla	v2.4s, v27.4s, v31.4s[2]
+	fmla	v3.4s, v27.4s, v31.4s[3]
+	cmp		w8, #4
+	fmla	v4.4s, v23.4s, v31.4s[0]
+	fmla	v5.4s, v23.4s, v31.4s[1]
+	fmla	v6.4s, v23.4s, v31.4s[2]
+	fmla	v7.4s, v23.4s, v31.4s[3]
+	fmla	v8.4s, v19.4s, v31.4s[0]
+	fmla	v9.4s, v19.4s, v31.4s[1]
+	fmla	v10.4s, v19.4s, v31.4s[2]
+	fmla	v11.4s, v19.4s, v31.4s[3]
+
+	bgt		1b
+
+0:
+
+	cmp		w8, #3
+	ble		4f
+
+	// unroll 0
+	ld1		{v26.4s}, [x9], #16
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	fmla	v1.4s, v24.4s, v28.4s[1]
+	ld1		{v27.4s}, [x9], #16
+	fmla	v2.4s, v24.4s, v28.4s[2]
+	fmla	v3.4s, v24.4s, v28.4s[3]
+	ld1		{v30.4s}, [x11], #16
+	fmla	v4.4s, v20.4s, v28.4s[0]
+	fmla	v5.4s, v20.4s, v28.4s[1]
+	ld1		{v31.4s}, [x11], #16
+	fmla	v6.4s, v20.4s, v28.4s[2]
+	fmla	v7.4s, v20.4s, v28.4s[3]
+	ld1		{v22.4s}, [x12], #16
+	fmla	v8.4s, v16.4s, v28.4s[0]
+	fmla	v9.4s, v16.4s, v28.4s[1]
+	ld1		{v23.4s}, [x12], #16
+	fmla	v10.4s, v16.4s, v28.4s[2]
+	fmla	v11.4s, v16.4s, v28.4s[3]
+
+	// unroll 1
+	ld1		{v18.4s}, [x13], #16
+	fmla	v0.4s, v25.4s, v29.4s[0]
+	fmla	v1.4s, v25.4s, v29.4s[1]
+	ld1		{v19.4s}, [x13], #16
+	fmla	v2.4s, v25.4s, v29.4s[2]
+	fmla	v3.4s, v25.4s, v29.4s[3]
+//	prfm	PLDL1KEEP, [x11, #64]
+	fmla	v4.4s, v21.4s, v29.4s[0]
+	fmla	v5.4s, v21.4s, v29.4s[1]
+//	prfm	PLDL1KEEP, [x9, #64]
+	fmla	v6.4s, v21.4s, v29.4s[2]
+	fmla	v7.4s, v21.4s, v29.4s[3]
+//	prfm	PLDL1KEEP, [x12, #64]
+	fmla	v8.4s, v17.4s, v29.4s[0]
+	fmla	v9.4s, v17.4s, v29.4s[1]
+	sub		w8, w8, #4
+	fmla	v10.4s, v17.4s, v29.4s[2]
+	fmla	v11.4s, v17.4s, v29.4s[3]
+
+	// unroll 2
+//	ld1		{v24.4s}, [x9], #16
+	fmla	v0.4s, v26.4s, v30.4s[0]
+	fmla	v1.4s, v26.4s, v30.4s[1]
+//	ld1		{v25.4s}, [x9], #16
+	fmla	v2.4s, v26.4s, v30.4s[2]
+	fmla	v3.4s, v26.4s, v30.4s[3]
+//	ld1		{v28.4s}, [x11], #16
+	fmla	v4.4s, v22.4s, v30.4s[0]
+	fmla	v5.4s, v22.4s, v30.4s[1]
+//	ld1		{v29.4s}, [x11], #16
+	fmla	v6.4s, v22.4s, v30.4s[2]
+	fmla	v7.4s, v22.4s, v30.4s[3]
+//	ld1		{v20.4s}, [x12], #16
+	fmla	v8.4s, v18.4s, v30.4s[0]
+	fmla	v9.4s, v18.4s, v30.4s[1]
+//	ld1		{v21.4s}, [x12], #16
+	fmla	v10.4s, v18.4s, v30.4s[2]
+	fmla	v11.4s, v18.4s, v30.4s[3]
+
+	// unroll 3
+//	ld1		{v16.4s}, [x13], #16
+	fmla	v0.4s, v27.4s, v31.4s[0]
+	fmla	v1.4s, v27.4s, v31.4s[1]
+//	ld1		{v17.4s}, [x13], #16
+	fmla	v2.4s, v27.4s, v31.4s[2]
+	fmla	v3.4s, v27.4s, v31.4s[3]
+	cmp		w8, #4
+	fmla	v4.4s, v23.4s, v31.4s[0]
+	fmla	v5.4s, v23.4s, v31.4s[1]
+	fmla	v6.4s, v23.4s, v31.4s[2]
+	fmla	v7.4s, v23.4s, v31.4s[3]
+	fmla	v8.4s, v19.4s, v31.4s[0]
+	fmla	v9.4s, v19.4s, v31.4s[1]
+	fmla	v10.4s, v19.4s, v31.4s[2]
+	fmla	v11.4s, v19.4s, v31.4s[3]
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		w8, #0
+	ble		2f // return
+
+	sub		x9, x9, #32
+	sub		x12, x12, #32
+	sub		x11, x11, #32
+	sub		x13, x13, #32
+
+3: // clean1-up loop
+
+	// unroll 0
+
+	ld1		{v28.4s}, [x11], #16
+	ld1		{v24.4s}, [x9], #16
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	fmla	v1.4s, v24.4s, v28.4s[1]
+	fmla	v2.4s, v24.4s, v28.4s[2]
+	fmla	v3.4s, v24.4s, v28.4s[3]
+	ld1		{v20.4s}, [x12], #16
+	fmla	v4.4s, v20.4s, v28.4s[0]
+	fmla	v5.4s, v20.4s, v28.4s[1]
+	fmla	v6.4s, v20.4s, v28.4s[2]
+	fmla	v7.4s, v20.4s, v28.4s[3]
+	ld1		{v16.4s}, [x13], #16
+	fmla	v8.4s, v16.4s, v28.4s[0]
+	fmla	v9.4s, v16.4s, v28.4s[1]
+	fmla	v10.4s, v16.4s, v28.4s[2]
+	fmla	v11.4s, v16.4s, v28.4s[3]
+
+	sub		w8, w8, #1
+	cmp		w8, #0
+	bgt		3b
+
+2: // return
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_kernel_gemm_add_nt_12x4_lib4, .-inner_kernel_gemm_add_nt_12x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8   <- alpha
+// x9   <- beta
+// x10  <- C
+// x11  <- 4*sdc*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_SCALE_AB_12X4_LIB4
+#else
+	.align	4
+	.type inner_scale_ab_12x4_lib4, %function
+inner_scale_ab_12x4_lib4:
+#endif
+
+	ld1		{v28.4s}, [x8]
+
+	fmul	v0.4s, v0.4s, v28.4s[0]
+	fmul	v1.4s, v1.4s, v28.4s[0]
+	fmul	v2.4s, v2.4s, v28.4s[0]
+	fmul	v3.4s, v3.4s, v28.4s[0]
+	fmul	v4.4s, v4.4s, v28.4s[0]
+	fmul	v5.4s, v5.4s, v28.4s[0]
+	fmul	v6.4s, v6.4s, v28.4s[0]
+	fmul	v7.4s, v7.4s, v28.4s[0]
+	fmul	v8.4s, v8.4s, v28.4s[0]
+	fmul	v9.4s, v9.4s, v28.4s[0]
+	fmul	v10.4s, v10.4s, v28.4s[0]
+	fmul	v11.4s, v11.4s, v28.4s[0]
+
+	ld1		{v28.4s}, [x9]
+
+	add		x12, x10, x11
+	add		x13, x12, x11
+
+	ld1		{v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	fmla	v1.4s, v25.4s, v28.4s[0]
+	fmla	v2.4s, v26.4s, v28.4s[0]
+	fmla	v3.4s, v27.4s, v28.4s[0]
+
+	ld1		{v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+	fmla	v4.4s, v24.4s, v28.4s[0]
+	fmla	v5.4s, v25.4s, v28.4s[0]
+	fmla	v6.4s, v26.4s, v28.4s[0]
+	fmla	v7.4s, v27.4s, v28.4s[0]
+
+	ld1		{v24.4s, v25.4s, v26.4s, v27.4s}, [x13], #64
+	fmla	v8.4s, v24.4s, v28.4s[0]
+	fmla	v9.4s, v25.4s, v28.4s[0]
+	fmla	v10.4s, v26.4s, v28.4s[0]
+	fmla	v11.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_scale_ab_12x4_lib4, .-inner_scale_ab_12x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8   <- D
+// x9   <- 4*sdd*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_STORE_12X4_LIB4
+#else
+	.align 4
+	.type inner_store_12x4_lib4, %function
+inner_store_12x4_lib4:
+#endif
+
+	add		x10, x8, x9
+	add		x11, x10, x9
+
+	st1		{v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+	st1		{v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
+	st1		{v8.4s, v9.4s, v10.4s, v11.4s}, [x11], #64
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_store_12x4_lib4, .-inner_store_12x4_lib4
+#endif
+
+
+
+
+
+//                                w0        x1            x2        w3       x4        x5           x6        w7       sp+0      sp+8
+// void kernel_sgemm_nt_12x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
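+//
+// note: this is the single-precision 12x4 kernel, so the panel strides are converted to
+// byte strides with lsl #4, i.e. multiplied by 4*sizeof(float); the inner kernel keeps
+// three 4-row panels of A in flight (x9, x12, x13) and accumulates into v0-v3, v4-v7 and
+// v8-v11 respectively.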
+
+	.align	4
+	.global	kernel_sgemm_nt_12x4_lib4
+	.type	kernel_sgemm_nt_12x4_lib4, %function
+kernel_sgemm_nt_12x4_lib4:
+	
+
+
+	PROLOGUE
+
+
+
+	// zero the accumulation registers (on AArch64 a write to dX also clears the upper 64 bits of vX, so the whole q register is zeroed)
+	fmov	d0, xzr
+	fmov    d1, d0
+	fmov    d2, d0
+	fmov    d3, d0
+	fmov    d4, d0
+	fmov    d5, d0
+	fmov    d6, d0
+	fmov    d7, d0
+	fmov    d8, d0
+	fmov    d9, d0
+	fmov    d10, d0
+	fmov    d11, d0
+
+
+
+	// call inner kernel gemm nt
+	mov		w8, w0 // kmax
+	mov		x9, x2 // A
+	mov		w10, w3 // sda
+	lsl		w10, w10, #4 // 16*sda
+	mov		x11, x4 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_12X4_LIB4
+#else
+	bl	inner_kernel_gemm_add_nt_12x4_lib4
+#endif
+
+
+
+	// call inner blend for generic alpha and beta
+	mov		x8, x1 // alpha
+	mov		x9, x5 // beta
+	mov		x10, x6 // C
+	mov		w11, w7 // sdc
+	lsl		w11, w11, #4 // 16*sdc
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_12X4_LIB4
+#else
+	bl inner_scale_ab_12x4_lib4
+#endif
+
+
+
+	// store n
+	ldr		x8, [sp, #(STACKSIZE + 0)] // D
+	ldr		w9, [sp, #(STACKSIZE + 8)] // sdd
+	lsl		w9, w9, #4 // 16*sdd
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_LIB4
+#else
+	bl inner_store_12x4_lib4
+#endif
+
+
+
+	EPILOGUE
+
+	mov	x0, #0
+
+	ret
+
+
+
+
diff --git a/kernel/armv8a/kernel_sgemm_16x4_lib4.S b/kernel/armv8a/kernel_sgemm_16x4_lib4.S
new file mode 100644
index 0000000..edc06ac
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_16x4_lib4.S
@@ -0,0 +1,600 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+	sub sp, sp, #(11 * 16); \
+	stp d8, d9, [sp, #(0 * 16)]; \
+	stp d10, d11, [sp, #(1 * 16)]; \
+	stp d12, d13, [sp, #(2 * 16)]; \
+	stp d14, d15, [sp, #(3 * 16)]; \
+	stp x18, x19, [sp, #(4 * 16)]; \
+	stp x20, x21, [sp, #(5 * 16)]; \
+	stp x22, x23, [sp, #(6 * 16)]; \
+	stp x24, x25, [sp, #(7 * 16)]; \
+	stp x26, x27, [sp, #(8 * 16)]; \
+	stp x28, x29, [sp, #(9 * 16)]; \
+	str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+	ldp d8, d9, [sp, #(0 * 16)]; \
+	ldp d10, d11, [sp, #(1 * 16)]; \
+	ldp d12, d13, [sp, #(2 * 16)]; \
+	ldp d14, d15, [sp, #(3 * 16)]; \
+	ldp x18, x19, [sp, #(4 * 16)]; \
+	ldp x20, x21, [sp, #(5 * 16)]; \
+	ldp x22, x23, [sp, #(6 * 16)]; \
+	ldp x24, x25, [sp, #(7 * 16)]; \
+	ldp x26, x27, [sp, #(8 * 16)]; \
+	ldp x28, x29, [sp, #(9 * 16)]; \
+	ldr x30, [sp, #(10 * 16)]; \
+	add sp, sp, #(11 * 16);
+
+
+
+
+
+	.text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8   <- k
+// x9   <- A
+// x10  <- sda
+// x11  <- B
+//
+// output arguments:
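+//
+// Reference semantics (illustrative sketch only, assuming the lib4 panel-major
+// layout; "acc" is a hypothetical 4x16 accumulator array, not a name used by
+// this kernel): A is four 4-row panels spaced 16*sda bytes apart, B is a single
+// 4-row panel, and the fmla chain below accumulates the 16x4 product A * B^T:
+//
+//   for (kk = 0; kk < k; kk++)
+//       for (jj = 0; jj < 4; jj++)
+//           for (ii = 0; ii < 16; ii++)
+//               acc[jj][ii] += A[(ii/4)*4*sda + 4*kk + ii%4] * B[4*kk + jj];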
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_16X4_LIB4
+#else
+	.align	4
+	.type inner_kernel_gemm_add_nt_16x4_lib4, %function
+inner_kernel_gemm_add_nt_16x4_lib4:
+#endif
+
+// TODO more aggressive preload of A !!!
+
+	// early return
+	cmp		w8, #0
+	ble		2f // return
+
+	add		x12, x9, x10
+	add		x13, x12, x10
+	add		x14, x13, x10
+
+	// prefetch
+	prfm	PLDL1KEEP, [x11, #0]
+	prfm	PLDL1KEEP, [x9, #0]
+	prfm	PLDL1KEEP, [x12, #0]
+	prfm	PLDL1KEEP, [x13, #0]
+	prfm	PLDL1KEEP, [x14, #0]
+
+	// preload
+	ldp		s24, s25, [x11], #8
+	ldp		s26, s27, [x11], #8
+	ldr		q16, [x9], #16
+	ldr		q17, [x12], #16
+	ldr		q18, [x13], #16
+	ldr		q19, [x14], #16
+
+	cmp		w8, #4
+	ble		0f // consider clean up loop
+
+	// prefetch
+	prfm	PLDL1KEEP, [x11, #32]
+	prfm	PLDL1KEEP, [x9, #32]
+	prfm	PLDL1KEEP, [x12, #32]
+	prfm	PLDL1KEEP, [x13, #32]
+	prfm	PLDL1KEEP, [x14, #32]
+
+	// main loop
+1:
+	
+	// unroll 0
+	ldp		s28, s29, [x11], #8
+	fmla	v0.4s, v16.4s, v24.4s[0]
+	fmla	v1.4s, v16.4s, v25.4s[0]
+	ldp		s30, s31, [x11], #8
+	fmla	v2.4s, v16.4s, v26.4s[0]
+	fmla	v3.4s, v16.4s, v27.4s[0]
+	ldr		q20, [x9], #16
+	fmla	v4.4s, v17.4s, v24.4s[0]
+	fmla	v5.4s, v17.4s, v25.4s[0]
+	ldr		q21, [x12], #16
+	fmla	v6.4s, v17.4s, v26.4s[0]
+	fmla	v7.4s, v17.4s, v27.4s[0]
+	ldr		q22, [x13], #16
+	fmla	v8.4s, v18.4s, v24.4s[0]
+	fmla	v9.4s, v18.4s, v25.4s[0]
+	ldr		q23, [x14], #16
+	fmla	v10.4s, v18.4s, v26.4s[0]
+	fmla	v11.4s, v18.4s, v27.4s[0]
+	prfm	PLDL1KEEP, [x11, #64]
+	fmla	v12.4s, v19.4s, v24.4s[0]
+	fmla	v13.4s, v19.4s, v25.4s[0]
+	prfm	PLDL1KEEP, [x9, #64]
+	fmla	v14.4s, v19.4s, v26.4s[0]
+	prfm	PLDL1KEEP, [x12, #64]
+	fmla	v15.4s, v19.4s, v27.4s[0]
+
+
+	// unroll 1
+	ldp		s24, s25, [x11], #8
+	fmla	v0.4s, v20.4s, v28.4s[0]
+	fmla	v1.4s, v20.4s, v29.4s[0]
+	ldp		s26, s27, [x11], #8
+	fmla	v2.4s, v20.4s, v30.4s[0]
+	fmla	v3.4s, v20.4s, v31.4s[0]
+	ldr		q16, [x9], #16
+	fmla	v4.4s, v21.4s, v28.4s[0]
+	fmla	v5.4s, v21.4s, v29.4s[0]
+	ldr		q17, [x12], #16
+	fmla	v6.4s, v21.4s, v30.4s[0]
+	fmla	v7.4s, v21.4s, v31.4s[0]
+	ldr		q18, [x13], #16
+	fmla	v8.4s, v22.4s, v28.4s[0]
+	fmla	v9.4s, v22.4s, v29.4s[0]
+	ldr		q19, [x14], #16
+	fmla	v10.4s, v22.4s, v30.4s[0]
+	fmla	v11.4s, v22.4s, v31.4s[0]
+	prfm	PLDL1KEEP, [x13, #32]
+	fmla	v12.4s, v23.4s, v28.4s[0]
+	fmla	v13.4s, v23.4s, v29.4s[0]
+	prfm	PLDL1KEEP, [x14, #32]
+	fmla	v14.4s, v23.4s, v30.4s[0]
+	fmla	v15.4s, v23.4s, v31.4s[0]
+
+	// unroll 2
+	ldp		s28, s29, [x11], #8
+	fmla	v0.4s, v16.4s, v24.4s[0]
+	fmla	v1.4s, v16.4s, v25.4s[0]
+	ldp		s30, s31, [x11], #8
+	fmla	v2.4s, v16.4s, v26.4s[0]
+	fmla	v3.4s, v16.4s, v27.4s[0]
+	ldr		q20, [x9], #16
+	fmla	v4.4s, v17.4s, v24.4s[0]
+	fmla	v5.4s, v17.4s, v25.4s[0]
+	ldr		q21, [x12], #16
+	fmla	v6.4s, v17.4s, v26.4s[0]
+	fmla	v7.4s, v17.4s, v27.4s[0]
+	ldr		q22, [x13], #16
+	fmla	v8.4s, v18.4s, v24.4s[0]
+	fmla	v9.4s, v18.4s, v25.4s[0]
+	ldr		q23, [x14], #16
+	fmla	v10.4s, v18.4s, v26.4s[0]
+	fmla	v11.4s, v18.4s, v27.4s[0]
+	fmla	v12.4s, v19.4s, v24.4s[0]
+	fmla	v13.4s, v19.4s, v25.4s[0]
+	fmla	v14.4s, v19.4s, v26.4s[0]
+	fmla	v15.4s, v19.4s, v27.4s[0]
+
+
+	// unroll 3
+	ldp		s24, s25, [x11], #8
+	fmla	v0.4s, v20.4s, v28.4s[0]
+	fmla	v1.4s, v20.4s, v29.4s[0]
+	ldp		s26, s27, [x11], #8
+	fmla	v2.4s, v20.4s, v30.4s[0]
+	fmla	v3.4s, v20.4s, v31.4s[0]
+	ldr		q16, [x9], #16
+	fmla	v4.4s, v21.4s, v28.4s[0]
+	fmla	v5.4s, v21.4s, v29.4s[0]
+	ldr		q17, [x12], #16
+	fmla	v6.4s, v21.4s, v30.4s[0]
+	fmla	v7.4s, v21.4s, v31.4s[0]
+	ldr		q18, [x13], #16
+	fmla	v8.4s, v22.4s, v28.4s[0]
+	fmla	v9.4s, v22.4s, v29.4s[0]
+	ldr		q19, [x14], #16
+	fmla	v10.4s, v22.4s, v30.4s[0]
+	fmla	v11.4s, v22.4s, v31.4s[0]
+	sub		w8, w8, #4
+	fmla	v12.4s, v23.4s, v28.4s[0]
+	fmla	v13.4s, v23.4s, v29.4s[0]
+	cmp		w8, #4
+	fmla	v14.4s, v23.4s, v30.4s[0]
+	fmla	v15.4s, v23.4s, v31.4s[0]
+
+	bgt		1b
+
+0:
+
+	cmp		w8, #3
+	ble		4f
+
+	
+	// unroll 0
+	ldp		s28, s29, [x11], #8
+	fmla	v0.4s, v16.4s, v24.4s[0]
+	fmla	v1.4s, v16.4s, v25.4s[0]
+	ldp		s30, s31, [x11], #8
+	fmla	v2.4s, v16.4s, v26.4s[0]
+	fmla	v3.4s, v16.4s, v27.4s[0]
+	ldr		q20, [x9], #16
+	fmla	v4.4s, v17.4s, v24.4s[0]
+	fmla	v5.4s, v17.4s, v25.4s[0]
+	ldr		q21, [x12], #16
+	fmla	v6.4s, v17.4s, v26.4s[0]
+	fmla	v7.4s, v17.4s, v27.4s[0]
+	ldr		q22, [x13], #16
+	fmla	v8.4s, v18.4s, v24.4s[0]
+	fmla	v9.4s, v18.4s, v25.4s[0]
+	ldr		q23, [x14], #16
+	fmla	v10.4s, v18.4s, v26.4s[0]
+	fmla	v11.4s, v18.4s, v27.4s[0]
+//	prfm	PLDL1KEEP, [x11, #64]
+	fmla	v12.4s, v19.4s, v24.4s[0]
+	fmla	v13.4s, v19.4s, v25.4s[0]
+//	prfm	PLDL1KEEP, [x9, #64]
+	fmla	v14.4s, v19.4s, v26.4s[0]
+	fmla	v15.4s, v19.4s, v27.4s[0]
+
+
+	// unroll 1
+	ldp		s24, s25, [x11], #8
+	fmla	v0.4s, v20.4s, v28.4s[0]
+	fmla	v1.4s, v20.4s, v29.4s[0]
+	ldp		s26, s27, [x11], #8
+	fmla	v2.4s, v20.4s, v30.4s[0]
+	fmla	v3.4s, v20.4s, v31.4s[0]
+	ldr		q16, [x9], #16
+	fmla	v4.4s, v21.4s, v28.4s[0]
+	fmla	v5.4s, v21.4s, v29.4s[0]
+	ldr		q17, [x12], #16
+	fmla	v6.4s, v21.4s, v30.4s[0]
+	fmla	v7.4s, v21.4s, v31.4s[0]
+	ldr		q18, [x13], #16
+	fmla	v8.4s, v22.4s, v28.4s[0]
+	fmla	v9.4s, v22.4s, v29.4s[0]
+	ldr		q19, [x14], #16
+	fmla	v10.4s, v22.4s, v30.4s[0]
+	fmla	v11.4s, v22.4s, v31.4s[0]
+//	prfm	PLDL1KEEP, [x12, #64]
+	fmla	v12.4s, v23.4s, v28.4s[0]
+	fmla	v13.4s, v23.4s, v29.4s[0]
+//	prfm	PLDL1KEEP, [x13, #64]
+	fmla	v14.4s, v23.4s, v30.4s[0]
+	fmla	v15.4s, v23.4s, v31.4s[0]
+
+	// unroll 2
+	ldp		s28, s29, [x11], #8
+	fmla	v0.4s, v16.4s, v24.4s[0]
+	fmla	v1.4s, v16.4s, v25.4s[0]
+	ldp		s30, s31, [x11], #8
+	fmla	v2.4s, v16.4s, v26.4s[0]
+	fmla	v3.4s, v16.4s, v27.4s[0]
+	ldr		q20, [x9], #16
+	fmla	v4.4s, v17.4s, v24.4s[0]
+	fmla	v5.4s, v17.4s, v25.4s[0]
+	ldr		q21, [x12], #16
+	fmla	v6.4s, v17.4s, v26.4s[0]
+	fmla	v7.4s, v17.4s, v27.4s[0]
+	ldr		q22, [x13], #16
+	fmla	v8.4s, v18.4s, v24.4s[0]
+	fmla	v9.4s, v18.4s, v25.4s[0]
+	ldr		q23, [x14], #16
+	fmla	v10.4s, v18.4s, v26.4s[0]
+	fmla	v11.4s, v18.4s, v27.4s[0]
+//	prfm	PLDL1KEEP, [x14, #64]
+	fmla	v12.4s, v19.4s, v24.4s[0]
+	fmla	v13.4s, v19.4s, v25.4s[0]
+	fmla	v14.4s, v19.4s, v26.4s[0]
+	fmla	v15.4s, v19.4s, v27.4s[0]
+
+
+	// unroll 3
+	ldp		s24, s25, [x11], #8
+	fmla	v0.4s, v20.4s, v28.4s[0]
+	fmla	v1.4s, v20.4s, v29.4s[0]
+	ldp		s26, s27, [x11], #8
+	fmla	v2.4s, v20.4s, v30.4s[0]
+	fmla	v3.4s, v20.4s, v31.4s[0]
+	ldr		q16, [x9], #16
+	fmla	v4.4s, v21.4s, v28.4s[0]
+	fmla	v5.4s, v21.4s, v29.4s[0]
+	ldr		q17, [x12], #16
+	fmla	v6.4s, v21.4s, v30.4s[0]
+	fmla	v7.4s, v21.4s, v31.4s[0]
+	ldr		q18, [x13], #16
+	fmla	v8.4s, v22.4s, v28.4s[0]
+	fmla	v9.4s, v22.4s, v29.4s[0]
+	ldr		q19, [x14], #16
+	fmla	v10.4s, v22.4s, v30.4s[0]
+	fmla	v11.4s, v22.4s, v31.4s[0]
+//	sub		w8, w8, #4
+	fmla	v12.4s, v23.4s, v28.4s[0]
+	fmla	v13.4s, v23.4s, v29.4s[0]
+//	cmp		w8, #4
+	fmla	v14.4s, v23.4s, v30.4s[0]
+	fmla	v15.4s, v23.4s, v31.4s[0]
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		w8, #0
+	ble		2f // return
+
+	sub		x9, x9, #16
+	sub		x11, x11, #16
+	sub		x12, x12, #16
+	sub		x13, x13, #16
+	sub		x14, x14, #16
+
+3: // clean1-up loop
+
+	// unroll 0
+	// TODO
+	ldp		s24, s25, [x11], #8
+	ldr		q16, [x9], #16
+	fmla	v0.4s, v16.4s, v24.4s[0]
+	fmla	v1.4s, v16.4s, v25.4s[0]
+	ldp		s26, s27, [x11], #8
+	fmla	v2.4s, v16.4s, v26.4s[0]
+	fmla	v3.4s, v16.4s, v27.4s[0]
+	ldr		q17, [x12], #16
+	fmla	v4.4s, v17.4s, v24.4s[0]
+	fmla	v5.4s, v17.4s, v25.4s[0]
+	fmla	v6.4s, v17.4s, v26.4s[0]
+	fmla	v7.4s, v17.4s, v27.4s[0]
+	ldr		q18, [x13], #16
+	fmla	v8.4s, v18.4s, v24.4s[0]
+	fmla	v9.4s, v18.4s, v25.4s[0]
+	fmla	v10.4s, v18.4s, v26.4s[0]
+	fmla	v11.4s, v18.4s, v27.4s[0]
+	ldr		q19, [x14], #16
+	fmla	v12.4s, v19.4s, v24.4s[0]
+	fmla	v13.4s, v19.4s, v25.4s[0]
+	fmla	v14.4s, v19.4s, v26.4s[0]
+	fmla	v15.4s, v19.4s, v27.4s[0]
+
+	sub		w8, w8, #1
+	cmp		w8, #0
+	bgt		3b
+
+2: // return
+
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_kernel_gemm_add_nt_16x4_lib4, .-inner_kernel_gemm_add_nt_16x4_lib4
+#endif
+
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8   <- alpha
+// x9   <- beta
+// x10  <- C
+// x11  <- sdc
+//
+// output arguments:
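+//
+// Sketch of the blend step (inferred from the fmul/fmla sequence below): each
+// accumulator entry produced by the inner kernel is rescaled and combined with
+// C, roughly  acc = alpha[0]*acc + beta[0]*C(i,j),  with C read panel-wise
+// through x10/sdc.  alpha and beta are dereferenced pointers; the ld1 loads
+// below pull in 4 floats but only lane 0 of v28 is used each time.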
+
+#if MACRO_LEVEL>=2
+	.macro INNER_SCALE_AB_16X4_LIB4
+#else
+	.align	4
+	.type inner_scale_ab_16x4_lib4, %function
+inner_scale_ab_16x4_lib4:
+#endif
+
+	ld1		{v28.4s}, [x8]
+
+	fmul	v0.4s, v0.4s, v28.4s[0]
+	fmul	v1.4s, v1.4s, v28.4s[0]
+	fmul	v2.4s, v2.4s, v28.4s[0]
+	fmul	v3.4s, v3.4s, v28.4s[0]
+	fmul	v4.4s, v4.4s, v28.4s[0]
+	fmul	v5.4s, v5.4s, v28.4s[0]
+	fmul	v6.4s, v6.4s, v28.4s[0]
+	fmul	v7.4s, v7.4s, v28.4s[0]
+	fmul	v8.4s, v8.4s, v28.4s[0]
+	fmul	v9.4s, v9.4s, v28.4s[0]
+	fmul	v10.4s, v10.4s, v28.4s[0]
+	fmul	v11.4s, v11.4s, v28.4s[0]
+	fmul	v12.4s, v12.4s, v28.4s[0]
+	fmul	v13.4s, v13.4s, v28.4s[0]
+	fmul	v14.4s, v14.4s, v28.4s[0]
+	fmul	v15.4s, v15.4s, v28.4s[0]
+
+	ld1		{v28.4s}, [x9]
+
+	add		x12, x10, x11
+	add		x13, x12, x11
+	add		x14, x13, x11
+
+	ld1		{v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	fmla	v1.4s, v25.4s, v28.4s[0]
+	fmla	v2.4s, v26.4s, v28.4s[0]
+	fmla	v3.4s, v27.4s, v28.4s[0]
+
+	ld1		{v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+	fmla	v4.4s, v24.4s, v28.4s[0]
+	fmla	v5.4s, v25.4s, v28.4s[0]
+	fmla	v6.4s, v26.4s, v28.4s[0]
+	fmla	v7.4s, v27.4s, v28.4s[0]
+
+	ld1		{v24.4s, v25.4s, v26.4s, v27.4s}, [x13], #64
+	fmla	v8.4s, v24.4s, v28.4s[0]
+	fmla	v9.4s, v25.4s, v28.4s[0]
+	fmla	v10.4s, v26.4s, v28.4s[0]
+	fmla	v11.4s, v27.4s, v28.4s[0]
+
+	ld1		{v24.4s, v25.4s, v26.4s, v27.4s}, [x14], #64
+	fmla	v12.4s, v24.4s, v28.4s[0]
+	fmla	v13.4s, v25.4s, v28.4s[0]
+	fmla	v14.4s, v26.4s, v28.4s[0]
+	fmla	v15.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_scale_ab_16x4_lib4, .-inner_scale_ab_16x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8   <- D
+// x9   <- sdd
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_STORE_16X4_LIB4
+#else
+	.align 4
+	.type inner_store_16x4_lib4, %function
+inner_store_16x4_lib4:
+#endif
+
+	add		x10, x8, x9
+	add		x11, x10, x9
+	add		x12, x11, x9
+
+	st1		{v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+	st1		{v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
+	st1		{v8.4s, v9.4s, v10.4s, v11.4s}, [x11], #64
+	st1		{v12.4s, v13.4s, v14.4s, v15.4s}, [x12], #64
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_store_16x4_lib4, .-inner_store_16x4_lib4
+#endif
+
+
+
+
+
+//                                w0        x1            x2        w3       x4        x5           x6        w7       sp+0      sp+8
+// void kernel_sgemm_nt_16x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
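+//
+// Hypothetical call-site sketch (names pA/pB/pC/pD are made up for
+// illustration; all operands are assumed to be in the lib4 panel-major format
+// and alpha/beta are passed by address, as in the prototype above):
+//
+//   float alpha = 1.0f, beta = 1.0f;
+//   // pA: 16 x kmax block (four 4-row panels, panel leading dimension sda)
+//   // pB: 4 x kmax panel;  pC, pD: 16 x 4 blocks with strides sdc, sdd
+//   kernel_sgemm_nt_16x4_lib4(kmax, &alpha, pA, sda, pB, &beta, pC, sdc, pD, sdd);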
+
+	.align	4
+	.global	kernel_sgemm_nt_16x4_lib4
+	.type	kernel_sgemm_nt_16x4_lib4, %function
+kernel_sgemm_nt_16x4_lib4:
+	
+
+
+	PROLOGUE
+
+
+
+	// TODO zero the entire 128-bit register ???
+	fmov	d0, xzr
+	fmov    d1, d0
+	fmov    d2, d0
+	fmov    d3, d0
+	fmov    d4, d0
+	fmov    d5, d0
+	fmov    d6, d0
+	fmov    d7, d0
+	fmov    d8, d0
+	fmov    d9, d0
+	fmov    d10, d0
+	fmov    d11, d0
+	fmov    d12, d0
+	fmov    d13, d0
+	fmov    d14, d0
+	fmov    d15, d0
+
+
+
+	// call inner kernel gemm nt
+	mov		w8, w0 // kmax
+	mov		x9, x2 // A
+	mov		w10, w3 // sda
+	lsl		w10, w10, #4 // 16*sda
+	mov		x11, x4 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB4
+#else
+	bl	inner_kernel_gemm_add_nt_16x4_lib4
+#endif
+
+
+
+	// call inner blend for generic alpha and beta
+	mov		x8, x1 // alpha
+	mov		x9, x5 // beta
+	mov		x10, x6 // C
+	mov		w11, w7 // sdc
+	lsl		w11, w11, #4 // 16*sdc
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB4
+#else
+	bl inner_scale_ab_16x4_lib4
+#endif
+
+
+
+	// store n
+	ldr		x8, [sp, #(STACKSIZE + 0)] // D
+	ldr		w9, [sp, #(STACKSIZE + 8)] // sdd
+	lsl		w9, w9, #4 // 16*sdd
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_LIB4
+#else
+	bl inner_store_16x4_lib4
+#endif
+
+
+
+	EPILOGUE
+
+	mov	x0, #0
+
+	ret
+
+
diff --git a/kernel/armv8a/kernel_sgemm_4x4_lib4.S b/kernel/armv8a/kernel_sgemm_4x4_lib4.S
new file mode 100644
index 0000000..6d3850d
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_4x4_lib4.S
@@ -0,0 +1,354 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+	add sp, sp, #-(11 * 16); \
+	stp d8, d9, [sp, #(0 * 16)]; \
+	stp d10, d11, [sp, #(1 * 16)]; \
+	stp d12, d13, [sp, #(2 * 16)]; \
+	stp d14, d15, [sp, #(3 * 16)]; \
+	stp x18, x19, [sp, #(4 * 16)]; \
+	stp x20, x21, [sp, #(5 * 16)]; \
+	stp x22, x23, [sp, #(6 * 16)]; \
+	stp x24, x25, [sp, #(7 * 16)]; \
+	stp x26, x27, [sp, #(8 * 16)]; \
+	stp x28, x29, [sp, #(9 * 16)]; \
+	str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+	ldp d8, d9, [sp, #(0 * 16)]; \
+	ldp d10, d11, [sp, #(1 * 16)]; \
+	ldp d12, d13, [sp, #(2 * 16)]; \
+	ldp d14, d15, [sp, #(3 * 16)]; \
+	ldp x18, x19, [sp, #(4 * 16)]; \
+	ldp x20, x21, [sp, #(5 * 16)]; \
+	ldp x22, x23, [sp, #(6 * 16)]; \
+	ldp x24, x25, [sp, #(7 * 16)]; \
+	ldp x26, x27, [sp, #(8 * 16)]; \
+	ldp x28, x29, [sp, #(9 * 16)]; \
+	ldr x30, [sp, #(10 * 16)]; \
+	add sp, sp, #(11 * 16);
+
+
+
+
+
+	.text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8   <- k
+// x9   <- A
+// x10   <- B
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
+#else
+	.align	4
+	.type inner_kernel_gemm_add_nt_4x4_lib4, %function
+inner_kernel_gemm_add_nt_4x4_lib4:
+#endif
+
+// TODO more aggressive preload of A !!!
+
+	// early return
+	cmp		w8, #0
+	ble		2f // return
+
+	// prefetch
+	prfm	PLDL1KEEP, [x9, #0]
+	prfm	PLDL1KEEP, [x10, #0]
+
+	cmp		w8, #4
+	ble		0f // consider clean up loop
+
+	// preload
+	ld1		{v24.2d, v25.2d}, [x9], #32
+	ld1		{v28.2d, v29.2d}, [x10], #32
+
+	// prefetch
+	prfm	PLDL1KEEP, [x9, #32]
+	prfm	PLDL1KEEP, [x10, #32]
+
+	// main loop
+1:
+	
+
+	// unroll 0
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	ld1		{v26.2d, v27.2d}, [x9], #32
+	fmla	v1.4s, v24.4s, v28.4s[1]
+	ld1		{v30.2d, v31.2d}, [x10], #32
+	fmla	v2.4s, v24.4s, v28.4s[2]
+	prfm	PLDL1KEEP, [x9, #64]
+	fmla	v3.4s, v24.4s, v28.4s[3]
+	prfm	PLDL1KEEP, [x10, #64]
+
+	// unroll 1
+	fmla	v0.4s, v25.4s, v29.4s[0]
+	sub		w8, w8, #4
+	fmla	v1.4s, v25.4s, v29.4s[1]
+	fmla	v2.4s, v25.4s, v29.4s[2]
+	fmla	v3.4s, v25.4s, v29.4s[3]
+
+	// unroll 2
+	fmla	v0.4s, v26.4s, v30.4s[0]
+	ld1		{v24.2d, v25.2d}, [x9], #32
+	fmla	v1.4s, v26.4s, v30.4s[1]
+	ld1		{v28.2d, v29.2d}, [x10], #32
+	fmla	v2.4s, v26.4s, v30.4s[2]
+	fmla	v3.4s, v26.4s, v30.4s[3]
+
+	// unroll 3
+	fmla	v0.4s, v27.4s, v31.4s[0]
+	fmla	v1.4s, v27.4s, v31.4s[1]
+	fmla	v2.4s, v27.4s, v31.4s[2]
+	fmla	v3.4s, v27.4s, v31.4s[3]
+
+	cmp		w8, #4
+	bgt		1b
+
+	sub		x9, x9, #32
+	sub		x10, x10, #32
+
+0:
+
+	cmp		w8, #3
+	ble		4f
+
+	// unroll 0
+	ld1		{v24.2d, v25.2d}, [x9], #32
+	ld1		{v28.2d, v29.2d}, [x10], #32
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	fmla	v1.4s, v24.4s, v28.4s[1]
+	fmla	v2.4s, v24.4s, v28.4s[2]
+	fmla	v3.4s, v24.4s, v28.4s[3]
+
+	// unroll 1
+	fmla	v0.4s, v25.4s, v29.4s[0]
+	fmla	v1.4s, v25.4s, v29.4s[1]
+	fmla	v2.4s, v25.4s, v29.4s[2]
+	fmla	v3.4s, v25.4s, v29.4s[3]
+
+	// unroll 2
+	ld1		{v24.2d, v25.2d}, [x9], #32
+	ld1		{v28.2d, v29.2d}, [x10], #32
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	fmla	v1.4s, v24.4s, v28.4s[1]
+	fmla	v2.4s, v24.4s, v28.4s[2]
+	fmla	v3.4s, v24.4s, v28.4s[3]
+
+	// unroll 3
+	fmla	v0.4s, v25.4s, v29.4s[0]
+	fmla	v1.4s, v25.4s, v29.4s[1]
+	fmla	v2.4s, v25.4s, v29.4s[2]
+	fmla	v3.4s, v25.4s, v29.4s[3]
+
+	sub		w8, w8, #4
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		w8, #0
+	ble		2f // return
+
+3: // clean1-up loop
+
+	// unroll 0
+	ld1		{v24.2d}, [x9], #16
+	ld1		{v28.2d}, [x10], #16
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	fmla	v1.4s, v24.4s, v28.4s[1]
+	fmla	v2.4s, v24.4s, v28.4s[2]
+	fmla	v3.4s, v24.4s, v28.4s[3]
+
+	sub		w8, w8, #1
+	cmp		w8, #0
+	bgt		3b
+
+2: // return
+
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_kernel_gemm_add_nt_4x4_lib4, .-inner_kernel_gemm_add_nt_4x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8   <- alpha
+// x9   <- beta
+// x10  <- C
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_SCALE_AB_4X4_LIB4
+#else
+	.align	4
+	.type inner_scale_ab_4x4_lib4, %function
+inner_scale_ab_4x4_lib4:
+#endif
+
+	ld1		{v28.2d}, [x8]
+
+	fmul	v0.4s, v0.4s, v28.4s[0]
+	fmul	v1.4s, v1.4s, v28.4s[0]
+	fmul	v2.4s, v2.4s, v28.4s[0]
+	fmul	v3.4s, v3.4s, v28.4s[0]
+
+	ld1		{v28.2d}, [x9]
+
+	ld1		{v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	fmla	v1.4s, v25.4s, v28.4s[0]
+	fmla	v2.4s, v26.4s, v28.4s[0]
+	fmla	v3.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8   <- D
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_STORE_4X4_LIB4
+#else
+	.align 4
+	.type inner_store_4x4_lib4, %function
+inner_store_4x4_lib4:
+#endif
+
+	st1		{v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+
+
+
+
+
+//                               w0        x1            x2        x3        x4           x5        x6
+// void kernel_sgemm_nt_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
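+//
+// Here every operand is a single 4-row panel, so no sda/sdb/sdc/sdd arguments
+// are needed; assuming the lib4 layout, element (i,j) of such a panel is just
+// p[4*j + i].  Minimal reference model (sketch, not part of the library):
+//
+//   for (jj = 0; jj < 4; jj++)
+//       for (ii = 0; ii < 4; ii++) {
+//           float acc = 0.0f;
+//           for (kk = 0; kk < kmax; kk++)
+//               acc += A[4*kk + ii] * B[4*kk + jj];
+//           D[4*jj + ii] = alpha[0]*acc + beta[0]*C[4*jj + ii];
+//       }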
+
+	.align	4
+	.global	kernel_sgemm_nt_4x4_lib4
+	.type	kernel_sgemm_nt_4x4_lib4, %function
+kernel_sgemm_nt_4x4_lib4:
+	
+
+
+	PROLOGUE
+
+
+
+	// TODO zero the entire 128-bit register ???
+	fmov	d0, xzr
+	fmov    d1, d0
+	fmov    d2, d0
+	fmov    d3, d0
+
+
+
+	// call inner kernel gemm nt
+	mov		w8, w0 // kmax
+	mov		x9, x2 // A
+	mov		x10, x3 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
+#else
+	bl	inner_kernel_gemm_add_nt_4x4_lib4
+#endif
+
+
+
+	// call inner blend for generic alpha and beta
+	mov		x8, x1 // alpha
+	mov		x9, x4 // beta
+	mov		x10, x5 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+	bl inner_scale_ab_4x4_lib4
+#endif
+
+
+
+	// store n
+	mov		x8, x6
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+	bl inner_store_4x4_lib4
+#endif
+
+
+
+	EPILOGUE
+
+	mov	x0, #0
+
+	ret
+
diff --git a/kernel/armv8a/kernel_sgemm_8x4_lib4.S b/kernel/armv8a/kernel_sgemm_8x4_lib4.S
new file mode 100644
index 0000000..016af72
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_8x4_lib4.S
@@ -0,0 +1,433 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+	sub sp, sp, #(11 * 16); \
+	stp d8, d9, [sp, #(0 * 16)]; \
+	stp d10, d11, [sp, #(1 * 16)]; \
+	stp d12, d13, [sp, #(2 * 16)]; \
+	stp d14, d15, [sp, #(3 * 16)]; \
+	stp x18, x19, [sp, #(4 * 16)]; \
+	stp x20, x21, [sp, #(5 * 16)]; \
+	stp x22, x23, [sp, #(6 * 16)]; \
+	stp x24, x25, [sp, #(7 * 16)]; \
+	stp x26, x27, [sp, #(8 * 16)]; \
+	stp x28, x29, [sp, #(9 * 16)]; \
+	str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+	ldp d8, d9, [sp, #(0 * 16)]; \
+	ldp d10, d11, [sp, #(1 * 16)]; \
+	ldp d12, d13, [sp, #(2 * 16)]; \
+	ldp d14, d15, [sp, #(3 * 16)]; \
+	ldp x18, x19, [sp, #(4 * 16)]; \
+	ldp x20, x21, [sp, #(5 * 16)]; \
+	ldp x22, x23, [sp, #(6 * 16)]; \
+	ldp x24, x25, [sp, #(7 * 16)]; \
+	ldp x26, x27, [sp, #(8 * 16)]; \
+	ldp x28, x29, [sp, #(9 * 16)]; \
+	ldr x30, [sp, #(10 * 16)]; \
+	add sp, sp, #(11 * 16);
+
+
+
+
+
+	.text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8   <- k
+// x9   <- A
+// x10  <- sda
+// x11  <- B
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+	.align	4
+	.type inner_kernel_gemm_add_nt_8x4_lib4, %function
+inner_kernel_gemm_add_nt_8x4_lib4:
+#endif
+
+	// early return
+	cmp		w8, #0
+	ble		2f // return
+
+	add		x12, x9, x10
+
+	// prefetch
+	prfm	PLDL1KEEP, [x11, #0]
+	prfm	PLDL1KEEP, [x9, #0]
+	prfm	PLDL1KEEP, [x12, #0]
+
+	// preload
+	ld1		{v24.4s, v25.4s}, [x9], #32
+	ld1		{v28.4s, v29.4s}, [x11], #32
+	ld1		{v20.4s, v21.4s}, [x12], #32
+
+	cmp		w8, #4
+	ble		0f // consider clean up loop
+
+	// prefetch
+	prfm	PLDL1KEEP, [x11, #32]
+	prfm	PLDL1KEEP, [x9, #32]
+	prfm	PLDL1KEEP, [x12, #32]
+
+	// main loop
+1:
+
+	// unroll 0
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	ld1		{v26.4s, v27.4s}, [x9], #32
+	fmla	v1.4s, v24.4s, v28.4s[1]
+	ld1		{v30.4s, v31.4s}, [x11], #32
+	fmla	v2.4s, v24.4s, v28.4s[2]
+	ld1		{v22.4s, v23.4s}, [x12], #32
+	fmla	v3.4s, v24.4s, v28.4s[3]
+	prfm	PLDL1KEEP, [x11, #64]
+	fmla	v4.4s, v20.4s, v28.4s[0]
+	prfm	PLDL1KEEP, [x9, #64]
+	fmla	v5.4s, v20.4s, v28.4s[1]
+	prfm	PLDL1KEEP, [x12, #64]
+	fmla	v6.4s, v20.4s, v28.4s[2]
+	fmla	v7.4s, v20.4s, v28.4s[3]
+	sub		w8, w8, #4
+
+	// unroll 1
+	fmla	v0.4s, v25.4s, v29.4s[0]
+	fmla	v1.4s, v25.4s, v29.4s[1]
+	fmla	v2.4s, v25.4s, v29.4s[2]
+	fmla	v3.4s, v25.4s, v29.4s[3]
+	fmla	v4.4s, v21.4s, v29.4s[0]
+	fmla	v5.4s, v21.4s, v29.4s[1]
+	fmla	v6.4s, v21.4s, v29.4s[2]
+	fmla	v7.4s, v21.4s, v29.4s[3]
+	cmp		w8, #4
+
+	// unroll 2
+	fmla	v0.4s, v26.4s, v30.4s[0]
+	ld1		{v24.4s, v25.4s}, [x9], #32
+	fmla	v1.4s, v26.4s, v30.4s[1]
+	ld1		{v28.4s, v29.4s}, [x11], #32
+	fmla	v2.4s, v26.4s, v30.4s[2]
+	ld1		{v20.4s, v21.4s}, [x12], #32
+	fmla	v3.4s, v26.4s, v30.4s[3]
+	fmla	v4.4s, v22.4s, v30.4s[0]
+	fmla	v5.4s, v22.4s, v30.4s[1]
+	fmla	v6.4s, v22.4s, v30.4s[2]
+	fmla	v7.4s, v22.4s, v30.4s[3]
+
+	// unroll 3
+	fmla	v0.4s, v27.4s, v31.4s[0]
+	fmla	v1.4s, v27.4s, v31.4s[1]
+	fmla	v2.4s, v27.4s, v31.4s[2]
+	fmla	v3.4s, v27.4s, v31.4s[3]
+	fmla	v4.4s, v23.4s, v31.4s[0]
+	fmla	v5.4s, v23.4s, v31.4s[1]
+	fmla	v6.4s, v23.4s, v31.4s[2]
+	fmla	v7.4s, v23.4s, v31.4s[3]
+
+	bgt		1b
+
+0:
+
+	cmp		w8, #3
+	ble		4f
+
+	// unroll 0
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	ld1		{v26.4s, v27.4s}, [x9], #32
+	fmla	v1.4s, v24.4s, v28.4s[1]
+	ld1		{v30.4s, v31.4s}, [x11], #32
+	fmla	v2.4s, v24.4s, v28.4s[2]
+	ld1		{v22.4s, v23.4s}, [x12], #32
+	fmla	v3.4s, v24.4s, v28.4s[3]
+//	prfm	PLDL1KEEP, [x11, #64]
+	fmla	v4.4s, v20.4s, v28.4s[0]
+//	prfm	PLDL1KEEP, [x9, #64]
+	fmla	v5.4s, v20.4s, v28.4s[1]
+//	prfm	PLDL1KEEP, [x12, #64]
+	fmla	v6.4s, v20.4s, v28.4s[2]
+	fmla	v7.4s, v20.4s, v28.4s[3]
+	sub		w8, w8, #4
+
+	// unroll 1
+	fmla	v0.4s, v25.4s, v29.4s[0]
+	fmla	v1.4s, v25.4s, v29.4s[1]
+	fmla	v2.4s, v25.4s, v29.4s[2]
+	fmla	v3.4s, v25.4s, v29.4s[3]
+	fmla	v4.4s, v21.4s, v29.4s[0]
+	fmla	v5.4s, v21.4s, v29.4s[1]
+	fmla	v6.4s, v21.4s, v29.4s[2]
+	fmla	v7.4s, v21.4s, v29.4s[3]
+//	cmp		w8, #4
+
+	// unroll 2
+	fmla	v0.4s, v26.4s, v30.4s[0]
+//	ld1		{v24.4s, v25.4s}, [x9], #32
+	fmla	v1.4s, v26.4s, v30.4s[1]
+//	ld1		{v28.4s, v29.4s}, [x11], #32
+	fmla	v2.4s, v26.4s, v30.4s[2]
+//	ld1		{v20.4s, v21.4s}, [x12], #32
+	fmla	v3.4s, v26.4s, v30.4s[3]
+//	ld1		{v16.4s, v17.4s}, [x13], #32
+	fmla	v4.4s, v22.4s, v30.4s[0]
+	fmla	v5.4s, v22.4s, v30.4s[1]
+	fmla	v6.4s, v22.4s, v30.4s[2]
+	fmla	v7.4s, v22.4s, v30.4s[3]
+
+	// unroll 3
+	fmla	v0.4s, v27.4s, v31.4s[0]
+	fmla	v1.4s, v27.4s, v31.4s[1]
+	fmla	v2.4s, v27.4s, v31.4s[2]
+	fmla	v3.4s, v27.4s, v31.4s[3]
+	fmla	v4.4s, v23.4s, v31.4s[0]
+	fmla	v5.4s, v23.4s, v31.4s[1]
+	fmla	v6.4s, v23.4s, v31.4s[2]
+	fmla	v7.4s, v23.4s, v31.4s[3]
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		w8, #0
+	ble		2f // return
+
+	sub		x9, x9, #32
+	sub		x12, x12, #32
+	sub		x11, x11, #32
+
+3: // clean1-up loop
+
+	// unroll 0
+
+	ld1		{v28.4s}, [x11], #16
+	ld1		{v24.4s}, [x9], #16
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	fmla	v1.4s, v24.4s, v28.4s[1]
+	fmla	v2.4s, v24.4s, v28.4s[2]
+	fmla	v3.4s, v24.4s, v28.4s[3]
+	ld1		{v20.4s}, [x12], #16
+	fmla	v4.4s, v20.4s, v28.4s[0]
+	fmla	v5.4s, v20.4s, v28.4s[1]
+	fmla	v6.4s, v20.4s, v28.4s[2]
+	fmla	v7.4s, v20.4s, v28.4s[3]
+
+	sub		w8, w8, #1
+	cmp		w8, #0
+	bgt		3b
+
+2: // return
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_kernel_gemm_add_nt_8x4_lib4, .-inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8   <- alpha
+// x9   <- beta
+// x10  <- C
+// x11  <- sdc
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_SCALE_AB_8X4_LIB4
+#else
+	.align	4
+	.type inner_scale_ab_8x4_lib4, %function
+inner_scale_ab_8x4_lib4:
+#endif
+
+	ld1		{v28.4s}, [x8]
+
+	fmul	v0.4s, v0.4s, v28.4s[0]
+	fmul	v1.4s, v1.4s, v28.4s[0]
+	fmul	v2.4s, v2.4s, v28.4s[0]
+	fmul	v3.4s, v3.4s, v28.4s[0]
+	fmul	v4.4s, v4.4s, v28.4s[0]
+	fmul	v5.4s, v5.4s, v28.4s[0]
+	fmul	v6.4s, v6.4s, v28.4s[0]
+	fmul	v7.4s, v7.4s, v28.4s[0]
+
+	ld1		{v28.4s}, [x9]
+
+	add		x12, x10, x11
+
+	ld1		{v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	fmla	v1.4s, v25.4s, v28.4s[0]
+	fmla	v2.4s, v26.4s, v28.4s[0]
+	fmla	v3.4s, v27.4s, v28.4s[0]
+
+	ld1		{v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+	fmla	v4.4s, v24.4s, v28.4s[0]
+	fmla	v5.4s, v25.4s, v28.4s[0]
+	fmla	v6.4s, v26.4s, v28.4s[0]
+	fmla	v7.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8   <- D
+// x9   <- sdd
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_STORE_8X4_LIB4
+#else
+	.align 4
+	.type inner_store_8x4_lib4, %function
+inner_store_8x4_lib4:
+#endif
+
+	add		x10, x8, x9
+
+	st1		{v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+	st1		{v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+
+
+
+
+
+//                               w0        x1            x2        w3       x4        x5           x6        w7       sp+0      sp+8
+// void kernel_sgemm_nt_8x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
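+//
+// Stride note (assumption, consistent with the "lsl #4 // 16*sda" in the
+// wrapper below): sda/sdc/sdd are panel leading dimensions counted in columns,
+// so consecutive 4-row panels of the same matrix are 4*sd* floats = 16*sd*
+// bytes apart; e.g. sda = 8 gives a panel-to-panel offset of 128 bytes.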
+
+	.align	4
+	.global	kernel_sgemm_nt_8x4_lib4
+	.type	kernel_sgemm_nt_8x4_lib4, %function
+kernel_sgemm_nt_8x4_lib4:
+	
+
+
+	PROLOGUE
+
+
+
+	// TODO zero the entire 128-bit register ???
+	fmov	d0, xzr
+	fmov    d1, d0
+	fmov    d2, d0
+	fmov    d3, d0
+	fmov    d4, d0
+	fmov    d5, d0
+	fmov    d6, d0
+	fmov    d7, d0
+
+
+
+	// call inner kernel gemm nt
+	mov		w8, w0 // kmax
+	mov		x9, x2 // A
+	mov		w10, w3 // sda
+	lsl		w10, w10, #4 // 16*sda
+	mov		x11, x4 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+	bl	inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+
+
+
+	// call inner blend for generic alpha and beta
+	mov		x8, x1 // alpha
+	mov		x9, x5 // beta
+	mov		x10, x6 // C
+	mov		w11, w7 // sdc
+	lsl		w11, w11, #4 // 16*sdc
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB4
+#else
+	bl inner_scale_ab_8x4_lib4
+#endif
+
+
+
+	// store n
+	ldr		x8, [sp, #(STACKSIZE + 0)] // D
+	ldr		w9, [sp, #(STACKSIZE + 8)] // sdd
+	lsl		w9, w9, #4 // 16*sdd
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+	bl inner_store_8x4_lib4
+#endif
+
+
+
+	EPILOGUE
+
+	mov	x0, #0
+
+	ret
+
+
+
diff --git a/kernel/armv8a/kernel_sgemm_8x8_lib4.S b/kernel/armv8a/kernel_sgemm_8x8_lib4.S
new file mode 100644
index 0000000..6c8c090
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_8x8_lib4.S
@@ -0,0 +1,565 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+	sub sp, sp, #(11 * 16); \
+	stp d8, d9, [sp, #(0 * 16)]; \
+	stp d10, d11, [sp, #(1 * 16)]; \
+	stp d12, d13, [sp, #(2 * 16)]; \
+	stp d14, d15, [sp, #(3 * 16)]; \
+	stp x18, x19, [sp, #(4 * 16)]; \
+	stp x20, x21, [sp, #(5 * 16)]; \
+	stp x22, x23, [sp, #(6 * 16)]; \
+	stp x24, x25, [sp, #(7 * 16)]; \
+	stp x26, x27, [sp, #(8 * 16)]; \
+	stp x28, x29, [sp, #(9 * 16)]; \
+	str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+	ldp d8, d9, [sp, #(0 * 16)]; \
+	ldp d10, d11, [sp, #(1 * 16)]; \
+	ldp d12, d13, [sp, #(2 * 16)]; \
+	ldp d14, d15, [sp, #(3 * 16)]; \
+	ldp x18, x19, [sp, #(4 * 16)]; \
+	ldp x20, x21, [sp, #(5 * 16)]; \
+	ldp x22, x23, [sp, #(6 * 16)]; \
+	ldp x24, x25, [sp, #(7 * 16)]; \
+	ldp x26, x27, [sp, #(8 * 16)]; \
+	ldp x28, x29, [sp, #(9 * 16)]; \
+	ldr x30, [sp, #(10 * 16)]; \
+	add sp, sp, #(11 * 16);
+
+
+
+
+
+	.text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8   <- k
+// x9   <- A
+// x10  <- sda
+// x11  <- B
+// x12  <- sdb
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB4
+#else
+	.align	4
+	.type inner_kernel_gemm_add_nt_8x8_lib4, %function
+inner_kernel_gemm_add_nt_8x8_lib4:
+#endif
+
+	// early return
+	cmp		w8, #0
+	ble		2f // return
+
+	add		x13, x9, x10
+	add		x14, x11, x12
+
+	// prefetch
+	prfm	PLDL1KEEP, [x11, #0]
+	prfm	PLDL1KEEP, [x9, #0]
+	prfm	PLDL1KEEP, [x13, #0]
+	prfm	PLDL1KEEP, [x14, #0]
+
+	// preload
+	ld1		{v24.4s, v25.4s}, [x9], #32
+	ld1		{v28.4s, v29.4s}, [x11], #32
+	ld1		{v20.4s, v21.4s}, [x13], #32
+	ld1		{v16.4s, v17.4s}, [x14], #32
+
+	cmp		w8, #4
+	ble		0f // consider clean up loop
+
+	// prefetch
+	prfm	PLDL1KEEP, [x11, #32]
+	prfm	PLDL1KEEP, [x9, #32]
+	prfm	PLDL1KEEP, [x13, #32]
+	prfm	PLDL1KEEP, [x14, #32]
+
+	// main loop
+1:
+
+	// unroll 0
+	ld1		{v26.4s}, [x9], #16
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	fmla	v1.4s, v24.4s, v28.4s[1]
+	ld1		{v27.4s}, [x9], #16
+	fmla	v2.4s, v24.4s, v28.4s[2]
+	fmla	v3.4s, v24.4s, v28.4s[3]
+	ld1		{v30.4s}, [x11], #16
+	fmla	v4.4s, v20.4s, v28.4s[0]
+	fmla	v5.4s, v20.4s, v28.4s[1]
+	ld1		{v31.4s}, [x11], #16
+	fmla	v6.4s, v20.4s, v28.4s[2]
+	fmla	v7.4s, v20.4s, v28.4s[3]
+	ld1		{v22.4s}, [x13], #16
+	fmla	v8.4s, v24.4s, v16.4s[0]
+	fmla	v9.4s, v24.4s, v16.4s[1]
+	ld1		{v23.4s}, [x13], #16
+	fmla	v10.4s, v24.4s, v16.4s[2]
+	fmla	v11.4s, v24.4s, v16.4s[3]
+	ld1		{v18.4s}, [x14], #16
+	fmla	v12.4s, v20.4s, v16.4s[0]
+	fmla	v13.4s, v20.4s, v16.4s[1]
+	ld1		{v19.4s}, [x14], #16
+	fmla	v14.4s, v20.4s, v16.4s[2]
+	fmla	v15.4s, v20.4s, v16.4s[3]
+
+	// unroll 1
+	prfm	PLDL1KEEP, [x11, #64]
+	fmla	v0.4s, v25.4s, v29.4s[0]
+	fmla	v1.4s, v25.4s, v29.4s[1]
+	prfm	PLDL1KEEP, [x9, #64]
+	fmla	v2.4s, v25.4s, v29.4s[2]
+	fmla	v3.4s, v25.4s, v29.4s[3]
+	prfm	PLDL1KEEP, [x13, #64]
+	fmla	v4.4s, v21.4s, v29.4s[0]
+	fmla	v5.4s, v21.4s, v29.4s[1]
+	prfm	PLDL1KEEP, [x14, #64]
+	fmla	v6.4s, v21.4s, v29.4s[2]
+	fmla	v7.4s, v21.4s, v29.4s[3]
+	sub		w8, w8, #4
+	fmla	v8.4s, v25.4s, v17.4s[0]
+	fmla	v9.4s, v25.4s, v17.4s[1]
+	fmla	v10.4s, v25.4s, v17.4s[2]
+	fmla	v11.4s, v25.4s, v17.4s[3]
+	fmla	v12.4s, v21.4s, v17.4s[0]
+	fmla	v13.4s, v21.4s, v17.4s[1]
+	cmp		w8, #4
+	fmla	v14.4s, v21.4s, v17.4s[2]
+	fmla	v15.4s, v21.4s, v17.4s[3]
+
+	// unroll 2
+	ld1		{v24.4s}, [x9], #16
+	fmla	v0.4s, v26.4s, v30.4s[0]
+	fmla	v1.4s, v26.4s, v30.4s[1]
+	ld1		{v25.4s}, [x9], #16
+	fmla	v2.4s, v26.4s, v30.4s[2]
+	fmla	v3.4s, v26.4s, v30.4s[3]
+	ld1		{v28.4s}, [x11], #16
+	fmla	v4.4s, v22.4s, v30.4s[0]
+	fmla	v5.4s, v22.4s, v30.4s[1]
+	ld1		{v29.4s}, [x11], #16
+	fmla	v6.4s, v22.4s, v30.4s[2]
+	fmla	v7.4s, v22.4s, v30.4s[3]
+	ld1		{v20.4s}, [x13], #16
+	fmla	v8.4s, v26.4s, v18.4s[0]
+	fmla	v9.4s, v26.4s, v18.4s[1]
+	ld1		{v21.4s}, [x13], #16
+	fmla	v10.4s, v26.4s, v18.4s[2]
+	fmla	v11.4s, v26.4s, v18.4s[3]
+	ld1		{v16.4s}, [x14], #16
+	fmla	v12.4s, v22.4s, v18.4s[0]
+	fmla	v13.4s, v22.4s, v18.4s[1]
+	ld1		{v17.4s}, [x14], #16
+	fmla	v14.4s, v22.4s, v18.4s[2]
+	fmla	v15.4s, v22.4s, v18.4s[3]
+
+	// unroll 3
+	fmla	v0.4s, v27.4s, v31.4s[0]
+	fmla	v1.4s, v27.4s, v31.4s[1]
+	fmla	v2.4s, v27.4s, v31.4s[2]
+	fmla	v3.4s, v27.4s, v31.4s[3]
+	fmla	v4.4s, v23.4s, v31.4s[0]
+	fmla	v5.4s, v23.4s, v31.4s[1]
+	fmla	v6.4s, v23.4s, v31.4s[2]
+	fmla	v7.4s, v23.4s, v31.4s[3]
+	fmla	v8.4s, v27.4s, v19.4s[0]
+	fmla	v9.4s, v27.4s, v19.4s[1]
+	fmla	v10.4s, v27.4s, v19.4s[2]
+	fmla	v11.4s, v27.4s, v19.4s[3]
+	fmla	v12.4s, v23.4s, v19.4s[0]
+	fmla	v13.4s, v23.4s, v19.4s[1]
+	fmla	v14.4s, v23.4s, v19.4s[2]
+	fmla	v15.4s, v23.4s, v19.4s[3]
+
+	bgt		1b
+
+0:
+
+	cmp		w8, #3
+	ble		4f
+
+	// unroll 0
+	ld1		{v26.4s}, [x9], #16
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	fmla	v1.4s, v24.4s, v28.4s[1]
+	ld1		{v27.4s}, [x9], #16
+	fmla	v2.4s, v24.4s, v28.4s[2]
+	fmla	v3.4s, v24.4s, v28.4s[3]
+	ld1		{v30.4s}, [x11], #16
+	fmla	v4.4s, v20.4s, v28.4s[0]
+	fmla	v5.4s, v20.4s, v28.4s[1]
+	ld1		{v31.4s}, [x11], #16
+	fmla	v6.4s, v20.4s, v28.4s[2]
+	fmla	v7.4s, v20.4s, v28.4s[3]
+	ld1		{v22.4s}, [x13], #16
+	fmla	v8.4s, v24.4s, v16.4s[0]
+	fmla	v9.4s, v24.4s, v16.4s[1]
+	ld1		{v23.4s}, [x13], #16
+	fmla	v10.4s, v24.4s, v16.4s[2]
+	fmla	v11.4s, v24.4s, v16.4s[3]
+	ld1		{v18.4s}, [x14], #16
+	fmla	v12.4s, v20.4s, v16.4s[0]
+	fmla	v13.4s, v20.4s, v16.4s[1]
+	ld1		{v19.4s}, [x14], #16
+	fmla	v14.4s, v20.4s, v16.4s[2]
+	fmla	v15.4s, v20.4s, v16.4s[3]
+
+	// unroll 1
+//	prfm	PLDL1KEEP, [x11, #64]
+	fmla	v0.4s, v25.4s, v29.4s[0]
+	fmla	v1.4s, v25.4s, v29.4s[1]
+//	prfm	PLDL1KEEP, [x9, #64]
+	fmla	v2.4s, v25.4s, v29.4s[2]
+	fmla	v3.4s, v25.4s, v29.4s[3]
+//	prfm	PLDL1KEEP, [x13, #64]
+	fmla	v4.4s, v21.4s, v29.4s[0]
+	fmla	v5.4s, v21.4s, v29.4s[1]
+//	prfm	PLDL1KEEP, [x14, #64]
+	fmla	v6.4s, v21.4s, v29.4s[2]
+	fmla	v7.4s, v21.4s, v29.4s[3]
+	sub		w8, w8, #4
+	fmla	v8.4s, v25.4s, v17.4s[0]
+	fmla	v9.4s, v25.4s, v17.4s[1]
+	fmla	v10.4s, v25.4s, v17.4s[2]
+	fmla	v11.4s, v25.4s, v17.4s[3]
+	fmla	v12.4s, v21.4s, v17.4s[0]
+	fmla	v13.4s, v21.4s, v17.4s[1]
+	cmp		w8, #4
+	fmla	v14.4s, v21.4s, v17.4s[2]
+	fmla	v15.4s, v21.4s, v17.4s[3]
+
+	// unroll 2
+//	ld1		{v24.4s}, [x9], #16
+	fmla	v0.4s, v26.4s, v30.4s[0]
+	fmla	v1.4s, v26.4s, v30.4s[1]
+//	ld1		{v25.4s}, [x9], #16
+	fmla	v2.4s, v26.4s, v30.4s[2]
+	fmla	v3.4s, v26.4s, v30.4s[3]
+//	ld1		{v28.4s}, [x11], #16
+	fmla	v4.4s, v22.4s, v30.4s[0]
+	fmla	v5.4s, v22.4s, v30.4s[1]
+//	ld1		{v29.4s}, [x11], #16
+	fmla	v6.4s, v22.4s, v30.4s[2]
+	fmla	v7.4s, v22.4s, v30.4s[3]
+//	ld1		{v20.4s}, [x13], #16
+	fmla	v8.4s, v26.4s, v18.4s[0]
+	fmla	v9.4s, v26.4s, v18.4s[1]
+//	ld1		{v21.4s}, [x13], #16
+	fmla	v10.4s, v26.4s, v18.4s[2]
+	fmla	v11.4s, v26.4s, v18.4s[3]
+//	ld1		{v16.4s}, [x14], #16
+	fmla	v12.4s, v22.4s, v18.4s[0]
+	fmla	v13.4s, v22.4s, v18.4s[1]
+//	ld1		{v17.4s}, [x14], #16
+	fmla	v14.4s, v22.4s, v18.4s[2]
+	fmla	v15.4s, v22.4s, v18.4s[3]
+
+	// unroll 3
+	fmla	v0.4s, v27.4s, v31.4s[0]
+	fmla	v1.4s, v27.4s, v31.4s[1]
+	fmla	v2.4s, v27.4s, v31.4s[2]
+	fmla	v3.4s, v27.4s, v31.4s[3]
+	fmla	v4.4s, v23.4s, v31.4s[0]
+	fmla	v5.4s, v23.4s, v31.4s[1]
+	fmla	v6.4s, v23.4s, v31.4s[2]
+	fmla	v7.4s, v23.4s, v31.4s[3]
+	fmla	v8.4s, v27.4s, v19.4s[0]
+	fmla	v9.4s, v27.4s, v19.4s[1]
+	fmla	v10.4s, v27.4s, v19.4s[2]
+	fmla	v11.4s, v27.4s, v19.4s[3]
+	fmla	v12.4s, v23.4s, v19.4s[0]
+	fmla	v13.4s, v23.4s, v19.4s[1]
+	fmla	v14.4s, v23.4s, v19.4s[2]
+	fmla	v15.4s, v23.4s, v19.4s[3]
+
+	b		2f // return
+
+4: // consider clean1-up loop
+
+	cmp		w8, #0
+	ble		2f // return
+
+	sub		x9, x9, #32
+	sub		x13, x13, #32
+	sub		x11, x11, #32
+	sub		x14, x14, #32
+
+3: // clean1-up loop
+
+	// unroll 0
+
+	ld1		{v28.4s}, [x11], #16
+	ld1		{v24.4s}, [x9], #16
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	fmla	v1.4s, v24.4s, v28.4s[1]
+	fmla	v2.4s, v24.4s, v28.4s[2]
+	fmla	v3.4s, v24.4s, v28.4s[3]
+	ld1		{v20.4s}, [x13], #16
+	fmla	v4.4s, v20.4s, v28.4s[0]
+	fmla	v5.4s, v20.4s, v28.4s[1]
+	fmla	v6.4s, v20.4s, v28.4s[2]
+	fmla	v7.4s, v20.4s, v28.4s[3]
+	ld1		{v16.4s}, [x14], #16
+	fmla	v8.4s, v24.4s, v16.4s[0]
+	fmla	v9.4s, v24.4s, v16.4s[1]
+	fmla	v10.4s, v24.4s, v16.4s[2]
+	fmla	v11.4s, v24.4s, v16.4s[3]
+	fmla	v12.4s, v20.4s, v16.4s[0]
+	fmla	v13.4s, v20.4s, v16.4s[1]
+	fmla	v14.4s, v20.4s, v16.4s[2]
+	fmla	v15.4s, v20.4s, v16.4s[3]
+
+	sub		w8, w8, #1
+	cmp		w8, #0
+	bgt		3b
+
+2: // return
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_kernel_gemm_add_nt_8x8_lib4, .-inner_kernel_gemm_add_nt_8x8_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8   <- alpha
+// x9   <- beta
+// x10  <- C
+// x11  <- sdc
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_SCALE_AB_8X8_LIB4
+#else
+	.align	4
+	.type inner_scale_ab_8x8_lib4, %function
+inner_scale_ab_8x8_lib4:
+#endif
+
+	ld1		{v28.4s}, [x8]
+
+	fmul	v0.4s, v0.4s, v28.4s[0]
+	fmul	v1.4s, v1.4s, v28.4s[0]
+	fmul	v2.4s, v2.4s, v28.4s[0]
+	fmul	v3.4s, v3.4s, v28.4s[0]
+	fmul	v4.4s, v4.4s, v28.4s[0]
+	fmul	v5.4s, v5.4s, v28.4s[0]
+	fmul	v6.4s, v6.4s, v28.4s[0]
+	fmul	v7.4s, v7.4s, v28.4s[0]
+	fmul	v8.4s, v8.4s, v28.4s[0]
+	fmul	v9.4s, v9.4s, v28.4s[0]
+	fmul	v10.4s, v10.4s, v28.4s[0]
+	fmul	v11.4s, v11.4s, v28.4s[0]
+	fmul	v12.4s, v12.4s, v28.4s[0]
+	fmul	v13.4s, v13.4s, v28.4s[0]
+	fmul	v14.4s, v14.4s, v28.4s[0]
+	fmul	v15.4s, v15.4s, v28.4s[0]
+
+	ld1		{v28.4s}, [x9]
+
+	add		x12, x10, x11
+
+	ld1		{v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+	fmla	v0.4s, v24.4s, v28.4s[0]
+	fmla	v1.4s, v25.4s, v28.4s[0]
+	fmla	v2.4s, v26.4s, v28.4s[0]
+	fmla	v3.4s, v27.4s, v28.4s[0]
+
+	ld1		{v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+	fmla	v4.4s, v24.4s, v28.4s[0]
+	fmla	v5.4s, v25.4s, v28.4s[0]
+	fmla	v6.4s, v26.4s, v28.4s[0]
+	fmla	v7.4s, v27.4s, v28.4s[0]
+
+	ld1		{v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+	fmla	v8.4s, v24.4s, v28.4s[0]
+	fmla	v9.4s, v25.4s, v28.4s[0]
+	fmla	v10.4s, v26.4s, v28.4s[0]
+	fmla	v11.4s, v27.4s, v28.4s[0]
+
+	ld1		{v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+	fmla	v12.4s, v24.4s, v28.4s[0]
+	fmla	v13.4s, v25.4s, v28.4s[0]
+	fmla	v14.4s, v26.4s, v28.4s[0]
+	fmla	v15.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_scale_ab_8x8_lib4, .-inner_scale_ab_8x8_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8   <- D
+// x9   <- sdd
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+	.macro INNER_STORE_8X8_LIB4
+#else
+	.align 4
+	.type inner_store_8x8_lib4, %function
+inner_store_8x8_lib4:
+#endif
+
+	add		x10, x8, x9
+
+	st1		{v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+	st1		{v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
+	st1		{v8.4s, v9.4s, v10.4s, v11.4s}, [x8], #64
+	st1		{v12.4s, v13.4s, v14.4s, v15.4s}, [x10], #64
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+	.size	inner_store_8x8_lib4, .-inner_store_8x8_lib4
+#endif
+
+
+
+
+
+//                               w0        x1            x2        w3       x4        w5       x6           x7        sp+0     sp+8      sp+16
+// void kernel_sgemm_nt_8x8_lib4(int kmax, float *alpha, float *A, int sda, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd)
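+//
+// Unlike the 8x4 kernel, B also spans two 4-row panels here, hence the extra
+// sdb argument.  For reference (assumed lib4 convention, not defined in this
+// file), element (i,j) of a panel-major matrix with leading dimension sd is
+//
+//   p[(i/4)*4*sd + 4*j + (i%4)]   // 4-row panels, column-major within a panel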
+
+	.align	4
+	.global	kernel_sgemm_nt_8x8_lib4
+	.type	kernel_sgemm_nt_8x8_lib4, %function
+kernel_sgemm_nt_8x8_lib4:
+	
+
+
+	PROLOGUE
+
+
+
+	// TODO zero the entire 128-bit register ???
+	fmov	d0, xzr
+	fmov    d1, d0
+	fmov    d2, d0
+	fmov    d3, d0
+	fmov    d4, d0
+	fmov    d5, d0
+	fmov    d6, d0
+	fmov    d7, d0
+	fmov    d8, d0
+	fmov    d9, d0
+	fmov    d10, d0
+	fmov    d11, d0
+	fmov    d12, d0
+	fmov    d13, d0
+	fmov    d14, d0
+	fmov    d15, d0
+
+
+
+	// call inner kernel gemm nt
+	mov		w8, w0 // kmax
+	mov		x9, x2 // A
+	mov		w10, w3 // sda
+	lsl		w10, w10, #4 // 16*sda
+	mov		x11, x4 // B
+	mov		w12, w5 // sdb
+	lsl		w12, w12, #4 // 16*sdb
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB4
+#else
+	bl	inner_kernel_gemm_add_nt_8x8_lib4
+#endif
+
+
+
+	// call inner blend for generic alpha and beta
+	mov		x8, x1 // alpha
+	mov		x9, x6 // beta
+	mov		x10, x7 // C
+	ldr		w11, [sp, #(STACKSIZE + 0)] // sdc
+	lsl		w11, w11, #4 // 16*sdc
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_LIB4
+#else
+	bl inner_scale_ab_8x8_lib4
+#endif
+
+
+
+	// store n
+	ldr		x8, [sp, #(STACKSIZE + 8)] // D
+	ldr		w9, [sp, #(STACKSIZE + 16)] // sdd
+	lsl		w9, w9, #4 // 16*sdd
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_LIB4
+#else
+	bl inner_store_8x8_lib4
+#endif
+
+
+
+	EPILOGUE
+
+	mov	x0, #0
+
+	ret
+
+
+
+
diff --git a/kernel/avx/Makefile b/kernel/avx/Makefile
new file mode 100644
index 0000000..f260086
--- /dev/null
+++ b/kernel/avx/Makefile
@@ -0,0 +1,54 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS = 
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_diag_lib8.o kernel_sgecp_lib8.o kernel_sgetr_lib8.o kernel_sgead_lib8.o kernel_sgesc_lib8.o kernel_sgemv_8_lib8.o kernel_sgemv_4_lib8.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += kernel_dgemm_8x4_lib4.o kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_12_lib4.o kernel_dgemv_8_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_6_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o kernel_dgebp_lib4.o
+OBJS += kernel_sgemm_16x4_lib8.o kernel_sgemm_8x8_lib8.o kernel_sgemm_8x4_lib8.o kernel_sgemm_diag_lib8.o kernel_sgecp_lib8.o kernel_sgetr_lib8.o kernel_sgead_lib8.o kernel_sgesc_lib8.o kernel_sgemv_8_lib8.o kernel_sgemv_4_lib8.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
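+# Example invocation (sketch): TARGET and LA are normally taken from
+# ../../Makefile.rule through the include above; standard make command-line
+# overrides also work, e.g.
+#
+#   make obj TARGET=X64_INTEL_SANDY_BRIDGE LA=HIGH_PERFORMANCE
+#
+# which would build the Sandy Bridge object list selected above.
+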
+obj: $(OBJS)
+
+clean:
+	rm -f *.o
+	rm -f *.s
+
diff --git a/kernel/avx/kernel_dgebp_lib4.S b/kernel/avx/kernel_dgebp_lib4.S
new file mode 100644
index 0000000..0e8581e
--- /dev/null
+++ b/kernel/avx/kernel_dgebp_lib4.S
@@ -0,0 +1,935 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+//                               1      2          3        4          5          6
+// void kernel_dger4_sub_8r_lib4(int k, double *A, int sda, double *B, double *C, int sdc)
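+//
+// Rough sketch of the update (as read from the code below, assuming the usual
+// lib4 panel-major storage; rows 4..7 of A and C live in the next 4-row panel,
+// reached via the 4*sda resp. 4*sdc double offsets):
+//
+//   for(jj=0; jj<k; jj++)
+//     for(ii=0; ii<8; ii++)
+//       C(ii,jj) -= A(ii,0)*B(0,jj) + A(ii,1)*B(1,jj) + A(ii,2)*B(2,jj) + A(ii,3)*B(3,jj);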
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger4_sub_8r_lib4
+	.type kernel_dger4_sub_8r_lib4, @function
+kernel_dger4_sub_8r_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger4_sub_8r_lib4
+_kernel_dger4_sub_8r_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger4_sub_8r_lib4
+	.def kernel_dger4_sub_8r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_8r_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+	sall	$5, %r15d // 4*sdc*sizeof(double)
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// load block from A
+	vmovapd	0(%r11), %ymm0
+	vmovapd	32(%r11), %ymm1
+	vmovapd	64(%r11), %ymm2
+	vmovapd	96(%r11), %ymm3
+
+	vmovapd	0(%r11, %r12, 1), %ymm4
+	vmovapd	32(%r11, %r12, 1), %ymm5
+	vmovapd	64(%r11, %r12, 1), %ymm6
+	vmovapd	96(%r11, %r12, 1), %ymm7
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r14), %ymm8
+	vmovapd			0(%r14, %r15, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	8(%r13), %ymm15
+	subl	$4, %r10d
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	16(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	24(%r13), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, 0(%r14)
+	vmovapd			%ymm9, 0(%r14, %r15, 1)
+
+	vmovapd			32(%r14), %ymm8
+	vmovapd			32(%r14, %r15, 1), %ymm9
+	vbroadcastsd	32(%r13), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	40(%r13), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	48(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	56(%r13), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, 32(%r14)
+	vmovapd			%ymm9, 32(%r14, %r15, 1)
+
+	vmovapd			64(%r14), %ymm8
+	vmovapd			64(%r14, %r15, 1), %ymm9
+	vbroadcastsd	64(%r13), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	72(%r13), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	80(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	88(%r13), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, 64(%r14)
+	vmovapd			%ymm9, 64(%r14, %r15, 1)
+
+	vmovapd			96(%r14), %ymm8
+	vmovapd			96(%r14, %r15, 1), %ymm9
+	vbroadcastsd	96(%r13), %ymm15
+	addq	$128, %r13
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	-24(%r13), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	-16(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	-8(%r13), %ymm15
+	addq	$128, %r14
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, -32(%r14)
+	vmovapd			%ymm9, -32(%r14, %r15, 1)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r14), %ymm8
+	vmovapd			0(%r14, %r15, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	8(%r13), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	16(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	24(%r13), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, 0(%r14)
+	vmovapd			%ymm9, 0(%r14, %r15, 1)
+
+	addq	$32, %r13
+	addq	$32, %r14
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger4_sub_8r_lib4, .-kernel_dger4_sub_8r_lib4
+#endif
+
+
+
+
+
+//                                  1      2          3        4          5          6        7
+// void kernel_dger4_sub_8r_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, int km)
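+//
+// Same update as kernel_dger4_sub_8r_lib4 above (as read from the code below),
+// except that the second 4-row panel of A is loaded with vmaskmovpd using a
+// mask built from km and .LC01, so A rows with index >= km read as zero and do
+// not contribute to C.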
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger4_sub_8r_vs_lib4
+	.type kernel_dger4_sub_8r_vs_lib4, @function
+kernel_dger4_sub_8r_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger4_sub_8r_vs_lib4
+_kernel_dger4_sub_8r_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger4_sub_8r_vs_lib4
+	.def kernel_dger4_sub_8r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_8r_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+	sall	$5, %r15d // 4*sdc*sizeof(double)
+	movq	ARG7, %rax // km
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	vcvtsi2sd	%eax, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC01(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC01(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	// load block from A
+	vmovapd	0(%r11), %ymm0
+	vmovapd	32(%r11), %ymm1
+	vmovapd	64(%r11), %ymm2
+	vmovapd	96(%r11), %ymm3
+
+	vmaskmovpd	0(%r11, %r12, 1), %ymm15, %ymm4
+	vmaskmovpd	32(%r11, %r12, 1), %ymm15, %ymm5
+	vmaskmovpd	64(%r11, %r12, 1), %ymm15, %ymm6
+	vmaskmovpd	96(%r11, %r12, 1), %ymm15, %ymm7
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r14), %ymm8
+	vmovapd			0(%r14, %r15, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	8(%r13), %ymm15
+	subl	$4, %r10d
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	16(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	24(%r13), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, 0(%r14)
+	vmovapd			%ymm9, 0(%r14, %r15, 1)
+
+	vmovapd			32(%r14), %ymm8
+	vmovapd			32(%r14, %r15, 1), %ymm9
+	vbroadcastsd	32(%r13), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	40(%r13), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	48(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	56(%r13), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, 32(%r14)
+	vmovapd			%ymm9, 32(%r14, %r15, 1)
+
+	vmovapd			64(%r14), %ymm8
+	vmovapd			64(%r14, %r15, 1), %ymm9
+	vbroadcastsd	64(%r13), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	72(%r13), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	80(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	88(%r13), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, 64(%r14)
+	vmovapd			%ymm9, 64(%r14, %r15, 1)
+
+	vmovapd			96(%r14), %ymm8
+	vmovapd			96(%r14, %r15, 1), %ymm9
+	vbroadcastsd	96(%r13), %ymm15
+	addq	$128, %r13
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	-24(%r13), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	-16(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	-8(%r13), %ymm15
+	addq	$128, %r14
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, -32(%r14)
+	vmovapd			%ymm9, -32(%r14, %r15, 1)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r14), %ymm8
+	vmovapd			0(%r14, %r15, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm4, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	8(%r13), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm5, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	16(%r13), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm6, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vbroadcastsd	24(%r13), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm8, %ymm8
+	vmulpd			%ymm7, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm9, %ymm9
+	vmovapd			%ymm8, 0(%r14)
+	vmovapd			%ymm9, 0(%r14, %r15, 1)
+
+	addq	$32, %r13
+	addq	$32, %r14
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger4_sub_8r_vs_lib4, .-kernel_dger4_sub_8r_vs_lib4
+#endif
+
+
+
+
+
+//                               1      2          3          4
+// void kernel_dger4_sub_4r_lib4(int n, double *A, double *B, double *C)
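+//
+// Rough sketch of the update (as read from the code below, assuming lib4
+// panel-major storage; a single 4-row panel, so no row strides are needed):
+//
+//   for(jj=0; jj<n; jj++)
+//     for(ii=0; ii<4; ii++)
+//       C(ii,jj) -= A(ii,0)*B(0,jj) + A(ii,1)*B(1,jj) + A(ii,2)*B(2,jj) + A(ii,3)*B(3,jj);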
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger4_sub_4r_lib4
+	.type kernel_dger4_sub_4r_lib4, @function
+kernel_dger4_sub_4r_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger4_sub_4r_lib4
+_kernel_dger4_sub_4r_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger4_sub_4r_lib4
+	.def kernel_dger4_sub_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_4r_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	movq	ARG4, %r13
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// load block from A
+	vmovapd	0(%r11), %ymm0
+	vmovapd	32(%r11), %ymm1
+	vmovapd	64(%r11), %ymm2
+	vmovapd	96(%r11), %ymm3
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r13), %ymm4
+	vbroadcastsd	0(%r12), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	8(%r12), %ymm15
+	subl	$4, %r10d
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	16(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	24(%r12), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, 0(%r13)
+
+	vmovapd			32(%r13), %ymm4
+	vbroadcastsd	32(%r12), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	40(%r12), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	48(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	56(%r12), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, 32(%r13)
+
+	vmovapd			64(%r13), %ymm4
+	vbroadcastsd	64(%r12), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	72(%r12), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	80(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	88(%r12), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, 64(%r13)
+
+	vmovapd			96(%r13), %ymm4
+	vbroadcastsd	96(%r12), %ymm15
+	addq	$128, %r12
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	-24(%r12), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	-16(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	-8(%r12), %ymm15
+	addq	$128, %r13
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, -32(%r13)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r13), %ymm4
+	vbroadcastsd	0(%r12), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	8(%r12), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	16(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	24(%r12), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, 0(%r13)
+
+	addq	$32, %r12
+	addq	$32, %r13
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger4_sub_4r_lib4, .-kernel_dger4_sub_4r_lib4
+#endif
+
+
+
+
+
+//                                  1      2          3          4          5
+// void kernel_dger4_sub_4r_vs_lib4(int n, double *A, double *B, double *C, int km)
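+//
+// Same update as kernel_dger4_sub_4r_lib4 above (as read from the code below),
+// but the block of A is loaded with vmaskmovpd using a mask built from km and
+// .LC00, so A rows with index >= km read as zero and the corresponding rows of
+// C are stored back unchanged.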
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger4_sub_4r_vs_lib4
+	.type kernel_dger4_sub_4r_vs_lib4, @function
+kernel_dger4_sub_4r_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger4_sub_4r_vs_lib4
+_kernel_dger4_sub_4r_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger4_sub_4r_vs_lib4
+	.def kernel_dger4_sub_4r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_4r_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	movq	ARG4, %r13
+	movq	ARG5, %r14
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	vcvtsi2sd	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC00(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC00(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	// load block from A
+	vmaskmovpd	0(%r11), %ymm15, %ymm0
+	vmaskmovpd	32(%r11), %ymm15, %ymm1
+	vmaskmovpd	64(%r11), %ymm15, %ymm2
+	vmaskmovpd	96(%r11), %ymm15, %ymm3
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r13), %ymm4
+	vbroadcastsd	0(%r12), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	8(%r12), %ymm15
+	subl	$4, %r10d
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	16(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	24(%r12), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, 0(%r13)
+
+	vmovapd			32(%r13), %ymm4
+	vbroadcastsd	32(%r12), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	40(%r12), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	48(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	56(%r12), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, 32(%r13)
+
+	vmovapd			64(%r13), %ymm4
+	vbroadcastsd	64(%r12), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	72(%r12), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	80(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	88(%r12), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, 64(%r13)
+
+	vmovapd			96(%r13), %ymm4
+	vbroadcastsd	96(%r12), %ymm15
+	addq	$128, %r12
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	-24(%r12), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	-16(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	-8(%r12), %ymm15
+	addq	$128, %r13
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, -32(%r13)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r13), %ymm4
+	vbroadcastsd	0(%r12), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	8(%r12), %ymm15
+	vmulpd			%ymm1, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	16(%r12), %ymm15
+	vmulpd			%ymm2, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vbroadcastsd	24(%r12), %ymm15
+	vmulpd			%ymm3, %ymm15, %ymm14
+	vsubpd			%ymm14, %ymm4, %ymm4
+	vmovapd			%ymm4, 0(%r13)
+
+	addq	$32, %r12
+	addq	$32, %r13
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger4_sub_4r_vs_lib4, .-kernel_dger4_sub_4r_vs_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00:
+#elif defined(OS_MAC)
+LC00:
+	.align 5
+#endif
+	.double 0.5
+	.double 1.5
+	.double 2.5
+	.double 3.5
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01:
+#elif defined(OS_MAC)
+LC01:
+	.align 5
+#endif
+	.double 4.5
+	.double 5.5
+	.double 6.5
+	.double 7.5
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02:
+#elif defined(OS_MAC)
+LC02:
+	.align 5
+#endif
+	.double 8.5
+	.double 9.5
+	.double 10.5
+	.double 11.5
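+
+	// note (as read from the _vs_ kernels above): these half-integer values are
+	// per-row thresholds for the vmaskmovpd masks; km is broadcast and subtracted,
+	// leaving a negative entry (sign bit set, element loaded) exactly for the rows
+	// whose index is below km; .LC00 covers rows 0..3, .LC01 rows 4..7 and .LC02
+	// extends the same pattern to rows 8..11.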
+
+
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
+
diff --git a/kernel/avx/kernel_dgemm_4x4_lib4.S b/kernel/avx/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..95ff6ea
--- /dev/null
+++ b/kernel/avx/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,9906 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+4*k*sizeof(double)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
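+
+// note on the layout above (as read from the loop body below): each vmulpd of
+// the A column with a copy of the B column produces one "diagonal" of the 4x4
+// product (d00 d11 d22 d33 from the unshuffled column, d01 d10 d23 d32 from the
+// in-lane swap, and the two cross-lane pairings from vperm2f128), so the four
+// accumulators are filled without any horizontal reductions.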
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r12), %ymm12 // B[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmovapd 32(%r12), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	// unroll 1
+	vmovapd 64(%r12), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	// unroll 2
+	vmovapd 96(%r12), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r12
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	addq	$128, %r11
+
+
+	// unroll 3
+	vmovapd 0(%r12), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 0(%r11), %ymm8 // A0[0]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vmovapd 32(%r12), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	// unroll 1
+	vmovapd 64(%r12), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	// unroll 2
+	vmovapd 96(%r12), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r12
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	addq	$128, %r11
+
+
+	// unroll 3
+//	vmovapd 0(%r12), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+//	vmovapd 0(%r11), %ymm8 // A0[0]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+
+//	cmpl	$3, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd 0(%r12), %ymm12 // B[0]
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	addq	$32, %r11
+
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	addq	$32, %r12
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	subl	$1, %r10d
+
+	vshufpd $0x5, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+4*k*sizeof(double)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
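+
+// note: same structure as inner_kernel_dgemm_add_nt_4x4_lib4 above, with vaddpd
+// replaced by vsubpd so the contribution is subtracted from the accumulators.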
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nt_4x4_lib4, @function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r12), %ymm12 // B[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmovapd 32(%r12), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+
+	// unroll 1
+	vmovapd 64(%r12), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+
+	// unroll 2
+	vmovapd 96(%r12), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r12
+	addq	$128, %r11
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+
+
+	// unroll 3
+	vmovapd 0(%r12), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	cmpl	$4, %r10d
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vmovapd 32(%r12), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+
+	// unroll 1
+	vmovapd 64(%r12), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+
+	// unroll 2
+	vmovapd 96(%r12), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r12
+	addq	$128, %r11
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+
+
+	// unroll 3
+//	vmovapd 0(%r12), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+//	vmovapd 0(%r11), %ymm8 // A0[0]
+//	cmpl	$3, %r10d
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd 0(%r12), %ymm12 // B[0]
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	addq	$32, %r11
+
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	addq	$32, %r12
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+
+	vshufpd $0x5, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
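+
+// note (as read from the loop below): in this NN variant the elements of B are
+// brought in one at a time with vbroadcastsd (offset jj*32 + ii*8 for element
+// (ii,jj) of the current 4-row block of B), r13 carries the byte stride to the
+// next B panel, and the prefetcht0 pair warms the B panel two strides ahead.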
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nn_4x4_lib4, @function
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 		0(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	0(%r12, %r13, 2) // software prefetch
+	prefetcht0	64(%r12, %r13, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	// unroll 0
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	addq	%r13, %r12
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	// unroll 0
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	addq	%r13, %r12
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	addq	$32, %r11
+	addq	$8, %r12
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nn_4x4_lib4, .-inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nn_4x4_lib4, @function
+inner_kernel_dgemm_sub_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 		0(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	0(%r12, %r13, 2) // software prefetch
+	prefetcht0	64(%r12, %r13, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+	// unroll 0
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	addq	%r13, %r12
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+	// unroll 0
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	addq	%r13, %r12
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+	addq	$32, %r11
+	addq	$8, %r12
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nn_4x4_lib4, .-inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- B
+// r12   <- C
+// ymm0  <- [a00 a10 a20 a30]
+// ymm1  <- [a01 a11 a21 a31]
+// ymm2  <- [a02 a12 a22 a32]
+// ymm3  <- [a03 a13 a23 a33]
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- ?
+// r12   <- ?
+// ymm0  <- [a00 a10 a20 a30]
+// ymm1  <- [a01 a11 a21 a31]
+// ymm2  <- [a02 a12 a22 a32]
+// ymm3  <- [a03 a13 a23 a33]
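+
+// rough sketch (as read from the code below, with the four columns of the 4x4
+// block of A already held in ymm0..ymm3 and B, C in lib4 panel-major storage):
+//
+//   for(jj=0; jj<k; jj++)
+//     for(ii=0; ii<4; ii++)
+//       C(ii,jj) += A(ii,0)*B(0,jj) + A(ii,1)*B(1,jj) + A(ii,2)*B(2,jj) + A(ii,3)*B(3,jj);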
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEBP_ADD_NN_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgebp_add_nn_4x4_lib4, @function
+inner_kernel_dgebp_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgebp_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_4x4_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r12), %ymm12
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	8(%r11), %ymm13
+	subl	$4, %r10d
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmovapd			%ymm12, 0(%r12)
+
+	vmovapd			32(%r12), %ymm12
+	vbroadcastsd	32(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	40(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	48(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	56(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmovapd			%ymm12, 32(%r12)
+
+	vmovapd			64(%r12), %ymm12
+	vbroadcastsd	64(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	72(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	80(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	88(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmovapd			%ymm12, 64(%r12)
+
+	vmovapd			96(%r12), %ymm12
+	vbroadcastsd	96(%r11), %ymm13
+	addq	$128, %r11
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	-24(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	-16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	-8(%r11), %ymm13
+	addq	$128, %r12
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmovapd			%ymm12, -32(%r12)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r12), %ymm12
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmovapd			%ymm12, 0(%r12)
+
+	addq	$32, %r11
+	addq	$32, %r12
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgebp_add_nn_4x4_lib4, .-inner_kernel_dgebp_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
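+
+// note (as read from the code below): this edge consumes min(k, 4-offB) steps
+// along k one at a time, so that B reaches a 4-row panel boundary; if work
+// remains, B is then repositioned at the start of its next row panel (add the
+// bs*sdb*sizeof(double) stride, subtract the 32 bytes advanced inside the
+// panel) before the aligned kernel continues.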
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgemm_add_nn_4x4_lib4, @function
+inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r14d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$4, %r15d
+	subl			%r14d, %r15d // 4-offsetB
+	cmpl			%r10d, %r15d
+//	jle				0f
+//	movl			%r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+	cmovgl			%r10d, %r15d // kend=min(k,4-offsetB)
+
+	movl			%r14d, %eax
+	sall			$3, %eax // offsetB*sizeof(double)
+	addq			%rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	subl			$1, %r15d // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r15d
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r13, %r12
+	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgemm_add_nn_4x4_lib4, .-inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10   <- A
+// r11   <- B
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- A+4*4*sizeof(double)
+// r11   <- B+4*4*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
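+//
+// for reference, a rough C-style sketch (illustrative only, not part of the
+// library; B is the 4x4 upper triangular block, panel-major, element (i,j)
+// at B[i+4*j]):
+//
+//	for (jj = 0; jj < 4; jj++)
+//		for (j = 0; j <= jj; j++)
+//			for (i = 0; i < 4; i++)
+//				D[i][j] += A[i + 4*jj] * B[j + 4*jj];	// D += A * triu(B)^T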
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_4x4_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#endif
+#endif
+	
+	vmovapd			0(%r10), %ymm8
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+
+	vmovapd			32(%r10), %ymm8
+	vbroadcastsd	32(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	40(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+
+	vmovapd			64(%r10), %ymm8
+	vbroadcastsd	64(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	72(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	80(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+
+	vmovapd			96(%r10), %ymm8
+	vbroadcastsd	96(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	104(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	112(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	120(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+
+	addq			$128, %r10
+	addq			$128, %r11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_4x4_lib4, .-inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- max(k-4,0)
+// r11   <- A+4*4*sizeof(double)
+// r12   <- B+4*4*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
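+//
+// same computation as the edge above, but clipped to kend = min(k,4) columns
+// of A / rows of B^T (rough illustrative sketch, not part of the library):
+//
+//	kend = k < 4 ? k : 4;
+//	for (jj = 0; jj < kend; jj++)
+//		for (j = 0; j <= jj; j++)
+//			for (i = 0; i < 4; i++)
+//				D[i][j] += A[i + 4*jj] * B[j + 4*jj];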
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_4x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+#endif
+	
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	addq			$32, %r11
+	addq			$32, %r12
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	addq			$32, %r11
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	addq			$32, %r12
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	addq			$32, %r11
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	addq			$32, %r12
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	addq			$32, %r11
+	vbroadcastsd	24(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	addq			$32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_4x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
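+//
+// for reference, a rough C-style sketch of the offB==0 branch (illustrative
+// only, not part of the library; B lower triangular, panel-major, element
+// (i,j) at B[i+4*j]):
+//
+//	for (ll = 0; ll < 4; ll++)
+//		for (j = 0; j <= ll; j++)
+//			for (i = 0; i < 4; i++)
+//				D[i][j] += A[i + 4*ll] * B[ll + 4*j];	// D += A * tril(B)
+//
+// for offB>0 the sweep starts at panel row offB of B and wraps to the next
+// panel row-block (B += bs*sdb doubles) once the current panel is exhausted.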
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nn_rl_4x4_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r14d
+	jg		0f
+
+	// offB==0
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+
+	vmovapd			32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	40(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+	vmovapd			64(%r11), %ymm8
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	48(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	80(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	vmovapd			96(%r11), %ymm8
+	vbroadcastsd	24(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	56(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	88(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	120(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A+4*bs*sizeof(double)
+	addq			%r13, %r12 // B+bs*sdb*sizeof(double)
+
+	jmp		3f
+
+0:
+	cmpl	$1, %r14d
+	jg		1f
+
+	// offB==1
+
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+
+	vmovapd			32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	40(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+	vmovapd			64(%r11), %ymm8
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	48(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	80(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	subl			$3, %r10d // k-3
+	addq			$96, %r11 // A+3*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$8, %r12 // B+bs*sdb*sizeof(double)-1
+
+	jmp		3f
+
+1:
+	cmpl	$2, %r14d
+	jg		2f
+
+	// offB==2
+
+	addq			$16, %r12 // B+2*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+
+	vmovapd			32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	40(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+	subl			$2, %r10d // k-2
+	addq			$64, %r11 // A+2*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$16, %r12 // B+bs*sdb*sizeof(double)-2
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	vmovapd			32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	40(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	72(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	104(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	vmovapd			64(%r11), %ymm8
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	48(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	80(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	112(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	vmovapd			96(%r11), %ymm8
+	vbroadcastsd	24(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	56(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	88(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	120(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A+4*bs*sizeof(double)
+	addq			%r13, %r12 // B+bs*sdb*sizeof(double)
+
+	jmp		3f
+
+2:
+	// offB==3
+
+	addq			$24, %r12 // B+3*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-3
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+	vmovapd			32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	40(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	72(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	vmovapd			64(%r11), %ymm8
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	48(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	80(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	112(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	vmovapd			96(%r11), %ymm8
+	vbroadcastsd	24(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	56(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	88(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	120(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A+4*bs*sizeof(double)
+	addq			%r13, %r12 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nn_rl_4x4_lib4, .-inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
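+//
+// same computation as inner_edge_dtrmm_nn_rl_4x4_lib4 above, except that k is
+// re-checked after every processed column, so the triangle can be cut short
+// when k < 4-offB (rough illustrative sketch for offB==0, not part of the
+// library):
+//
+//	for (ll = 0; ll < 4 && k > 0; ll++, k--)
+//		for (j = 0; j <= ll; j++)
+//			for (i = 0; i < 4; i++)
+//				D[i][j] += A[i + 4*ll] * B[ll + 4*j];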
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NN_RL_4X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nn_rl_4x4_gen_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nn_rl_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	cmpl			$0, %r14d
+	jg				0f // offB>0
+
+	// offB==0
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f // end
+
+0:
+	cmpl			$1, %r14d
+	jg				1f // offB>1
+
+	// offB==1
+
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f // end
+
+1:
+	cmpl			$2, %r14d
+	jg				2f // offB>2
+
+	// offB==2
+
+	addq			$16, %r12 // B+2*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f
+
+2:
+	// offB==3
+
+	addq			$24, %r12 // B+3*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nn_rl_4x4_gen_lib4, .-inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for dlauum
+//
+// input arguments:
+// r10   <- A
+// r11   <- B
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- A+4*4*sizeof(double)
+// r11   <- B+4*4*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
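+//
+// for reference, a rough C-style sketch (illustrative only, not part of the
+// library): column jj of A is masked to its upper-triangular part (rows
+// i<=jj) before the usual nt rank-1 updates, so the edge accumulates the
+// diagonal-block product D += triu(A) * triu(B)^T:
+//
+//	for (jj = 0; jj < 4; jj++)
+//		for (j = 0; j <= jj; j++)
+//			for (i = 0; i <= jj; i++)
+//				D[i][j] += A[i + 4*jj] * B[j + 4*jj];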
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DLAUUM_NT_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dlauum_nt_4x4_lib4, @function
+inner_edge_dlauum_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dlauum_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dlauum_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dlauum_nt_4x4_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vmovapd			0(%r10), %ymm8
+	vblendpd		$0x1, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+
+	vmovapd			32(%r10), %ymm8
+	vblendpd		$0x3, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	32(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	40(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+
+	vmovapd			64(%r10), %ymm8
+	vblendpd		$0x7, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	64(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	72(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	80(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+
+	vmovapd			96(%r10), %ymm8
+	vbroadcastsd	96(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	104(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	112(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	120(%r11), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+
+	addq			$128, %r10
+	addq			$128, %r11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dlauum_nt_4x4_lib4, .-inner_edge_dlauum_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for dlauum
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- max(k-4,0)
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+4*k*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DLAUUM_NT_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dlauum_nt_4x4_vs_lib4, @function
+inner_edge_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dlauum_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dlauum_nt_4x4_vs_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vblendpd		$0x1, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	addq			$32, %r11
+	addq			$32, %r12
+
+	cmpl			$0, %r10d
+	jle				0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vblendpd		$0x3, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	addq			$32, %r11
+	addq			$32, %r12
+
+	cmpl			$0, %r10d
+	jle				0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vblendpd		$0x7, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	addq			$32, %r11
+	addq			$32, %r12
+
+	cmpl			$0, %r10d
+	jle				0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	24(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	addq			$32, %r11
+	addq			$32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dlauum_nt_4x4_vs_lib4, .-inner_edge_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
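+//
+// the nt kernels accumulate with the columns rotated across lanes (lane i of
+// accumulator r holds d_{i, i^r}); the blends below undo that permutation.
+// rough C-style sketch of the net effect (illustrative only, not part of the
+// library):
+//
+//	for (j = 0; j < 4; j++)
+//		for (i = 0; i < 4; i++)
+//			out[j][i] = acc[i ^ j][i];	// out[j] = register holding column j of D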
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_4x4_lib4, @function
+inner_blend_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_4x4_lib4:
+#endif	
+#endif	
+	
+
+	// tc==n
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_4x4_lib4, .-inner_blend_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
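+//
+// rough C-style sketch (illustrative only, not part of the library); note
+// that C is not read at all when beta==0.0:
+//
+//	for (j = 0; j < 4; j++)
+//		for (i = 0; i < 4; i++)
+//			D[i][j] = beta != 0.0 ? alpha*D[i][j] + beta*C[i + 4*j] : alpha*D[i][j];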
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_4x4_lib4, @function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// r15  <- dirty (C1 <- C0 + 4*sdc*sizeof(double) when offset>0)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
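+//
+// rough C-style sketch (illustrative only, not part of the library): the 4x4
+// block of C starts at row `offset` inside its panel, so each column is
+// gathered from the two consecutive panels C0 (=C) and C1 (=C + 4*sdc
+// doubles) and rotated into place:
+//
+//	for (j = 0; j < 4; j++)
+//		for (i = 0; i < 4; i++) {
+//			r = offset + i;
+//			c = r < 4 ? C0[r + 4*j] : C1[r-4 + 4*j];
+//			D[i][j] = alpha*D[i][j] + beta*c;	// skipped entirely when beta==0.0
+//		}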
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_4x4_gen_lib4, @function
+inner_scale_ab_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_gen_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+
+	vxorpd		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovapd		0(%r13), %ymm12
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmovapd		32(%r13), %ymm12
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm1, %ymm12, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmovapd		96(%r13), %ymm12
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm3, %ymm12, %ymm3
+
+	jmp		3f
+
+0:
+
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$1, %r12d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm1, %ymm13, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm3, %ymm13, %ymm3
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r12d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm1, %ymm13, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm3, %ymm13, %ymm3
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm1, %ymm13, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm3, %ymm13, %ymm3
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_4x4_gen_lib4, .-inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10   <- alpha
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_A0_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_a0_4x4_lib4, @function
+inner_scale_a0_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_a0_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_4x4_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_a0_4x4_lib4, .-inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_4x4_lib4, @function
+inner_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_4x4_lib4:
+#endif	
+#endif	
+	
+	vmovapd		0(%r10), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_4x4_lib4, .-inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// r15  <- dirty (C1 <- C0 + 4*sdc*sizeof(double) when offset>0)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_4x4_gen_lib4, @function
+inner_blend_scale_ab_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_gen_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+
+	vxorpd		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovapd		0(%r13), %ymm12
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmovapd		32(%r13), %ymm12
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm1, %ymm12, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmovapd		96(%r13), %ymm12
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm3, %ymm12, %ymm3
+
+	jmp		3f
+
+0:
+
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$1, %r12d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm1, %ymm13, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm3, %ymm13, %ymm3
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r12d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm1, %ymm13, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm3, %ymm13, %ymm3
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm1, %ymm13, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm3, %ymm13, %ymm3
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_4x4_gen_lib4, .-inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_4x4_lib4, @function
+inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_4x4_lib4:
+#endif	
+#endif	
+	
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmovapd		0(%r10), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_4x4_lib4, .-inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization 
+//
+// input arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
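+//
+// rough C-style sketch of the factorization performed on the 4x4 accumulator
+// (illustrative only, not part of the library; only the lower triangle of D
+// is meaningful, and a non-positive pivot zeroes its column via inv=0.0):
+//
+//	for (j = 0; j < 4; j++) {
+//		inv = D[j][j] > 0.0 ? 1.0/sqrt(D[j][j]) : 0.0;
+//		inv_diag_E[j] = inv;
+//		for (i = 0; i < 4; i++) D[i][j] *= inv;
+//		for (jj = j+1; jj < 4; jj++)
+//			for (i = 0; i < 4; i++)
+//				D[i][jj] -= D[i][j] * D[jj][j];
+//	}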
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dpotrf_4x4_lib4, @function
+inner_edge_dpotrf_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dpotrf_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_lib4:
+#endif
+#endif
+	
+	vxorpd	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd	.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd	LC04(%rip), %xmm14 // 1.0
+#endif
+
+	vmovsd		%xmm0, %xmm0, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+2:
+	vmovsd		%xmm13, 0(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm0
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm1, %ymm1
+	vperm2f128	$0x11, %ymm0, %ymm0, %ymm11
+	vpermilpd	$0x0, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm2, %ymm2
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+
+
+	vpermilpd	$0x3, %xmm1, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+4:
+	vmovsd		%xmm13, 8(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm1
+	vperm2f128	$0x11, %ymm1, %ymm1, %ymm11
+	vpermilpd	$0x0, %ymm11, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm2, %ymm2
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+6:
+	vmovsd		%xmm13, 16(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm2
+	vperm2f128	$0x11, %ymm2, %ymm2, %ymm11
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+
+	vextractf128	$0x1, %ymm3, %xmm13
+	vpermilpd	$0x3, %xmm13, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+8:
+	vmovsd		%xmm13, 24(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm3, %ymm13, %ymm3
+
+	jmp		0f
+
+1:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dpotrf_4x4_lib4, .-inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization 
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dpotrf_4x4_vs_lib4, @function
+inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dpotrf_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_vs_lib4:
+#endif
+#endif
+	
+	vxorpd	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd	.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd	LC04(%rip), %xmm14 // 1.0
+#endif
+
+	vmovsd		%xmm0, %xmm0, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+2:
+	vmovsd		%xmm13, 0(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm0
+	cmpl		$2, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm1, %ymm1
+	vperm2f128	$0x11, %ymm0, %ymm0, %ymm11
+	vpermilpd	$0x0, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm2, %ymm2
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+
+
+	vpermilpd	$0x3, %xmm1, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+4:
+	vmovsd		%xmm13, 8(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm1
+	cmpl		$3, %r11d
+	jl			0f // ret
+	vperm2f128	$0x11, %ymm1, %ymm1, %ymm11
+	vpermilpd	$0x0, %ymm11, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm2, %ymm2
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+6:
+	vmovsd		%xmm13, 16(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm2
+	cmpl		$4, %r11d
+	jl			0f // ret
+	vperm2f128	$0x11, %ymm2, %ymm2, %ymm11
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+
+
+	vextractf128	$0x1, %ymm3, %xmm13
+	vpermilpd	$0x3, %xmm13, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+8:
+	vmovsd		%xmm13, 24(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm3, %ymm13, %ymm3
+
+	jmp		0f
+
+1:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dpotrf_4x4_vs_lib4, .-inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
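+//
+// rough C-style sketch (illustrative only, not part of the library): solves
+// X*E^T = D in place for lower triangular E (element (i,j) at E[i+4*j]),
+// using the precomputed reciprocals inv_diag_E[j] = 1.0/E(j,j):
+//
+//	for (j = 0; j < 4; j++) {
+//		for (i = 0; i < 4; i++) D[i][j] *= inv_diag_E[j];
+//		for (jj = j+1; jj < 4; jj++)
+//			for (i = 0; i < 4; i++)
+//				D[i][jj] -= D[i][j] * E[jj + 4*j];
+//	}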
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_4x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vbroadcastsd	8(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm1, %ymm1
+	vbroadcastsd	16(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vbroadcastsd	24(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vbroadcastsd	48(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vbroadcastsd	56(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vbroadcastsd	88(%r10), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
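+
+// Note: as a reading aid only, a scalar sketch of the solve above (right, lower,
+// transposed: D <- D * E^{-T}), with E stored column-wise in a 4x4 block (bs=4)
+// and inv_diag_E[j] = 1.0/E[j+4*j] precomputed by the caller:
+//
+//	for(j=0; j<4; j++)
+//		{
+//		for(i=0; i<4; i++) D[i+4*j] *= inv_diag_E[j];
+//		for(jj=j+1; jj<4; jj++)
+//			for(i=0; i<4; i++) D[i+4*jj] -= E[jj+4*j]*D[i+4*j];
+//		}
+//
+// The vs variant below performs the same recurrence but returns early once the
+// first kn columns have been processed.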
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	cmpl			$2, %r12d
+	jl				0f // ret
+	vbroadcastsd	8(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm1, %ymm1
+	vbroadcastsd	16(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vbroadcastsd	24(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+
+
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	cmpl			$3, %r12d
+	jl				0f // ret
+	vbroadcastsd	48(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vbroadcastsd	56(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+
+
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	cmpl			$4, %r12d
+	jl				0f // ret
+	vbroadcastsd	88(%r10), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+
+
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_one_4x4_lib4, @function
+inner_edge_dtrsm_rlt_one_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_one_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_4x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	8(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm1, %ymm1
+
+	vbroadcastsd	16(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vbroadcastsd	48(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+
+	vbroadcastsd	24(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vbroadcastsd	56(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vbroadcastsd	88(%r10), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_one_4x4_lib4, .-inner_edge_dtrsm_rlt_one_4x4_lib4
+#endif
+#endif
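+
+// Note: the unit-diagonal variant above drops the inv_diag_E scaling; as a reading
+// aid only, a scalar sketch (E stored column-wise, bs=4):
+//
+//	for(j=0; j<4; j++)
+//		for(jj=j+1; jj<4; jj++)
+//			for(i=0; i<4; i++) D[i+4*jj] -= E[jj+4*j]*D[i+4*j];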
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10  <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_one_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl			$2, %r11d
+
+	jl				0f // ret
+
+	vbroadcastsd	8(%r10), %ymm13
+	cmpl			$3, %r11d
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm1, %ymm1
+
+	jl				0f // ret
+
+	vbroadcastsd	16(%r10), %ymm13
+	cmpl			$4, %r11d
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vbroadcastsd	48(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+
+	jl				0f // ret
+
+	vbroadcastsd	24(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vbroadcastsd	56(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vbroadcastsd	88(%r10), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_one_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rut_inv_4x4_lib4, @function
+inner_edge_dtrsm_rut_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rut_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_4x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vbroadcastsd	112(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	104(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	96(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vbroadcastsd	72(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vbroadcastsd	32(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rut_inv_4x4_lib4, .-inner_edge_dtrsm_rut_inv_4x4_lib4
+#endif
+#endif
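+
+// Note: as a reading aid only, a scalar sketch of the solve above (right, upper,
+// transposed: D <- D * E^{-T} with E upper triangular), which processes the
+// columns in reverse order (E stored column-wise, bs=4):
+//
+//	for(j=3; j>=0; j--)
+//		{
+//		for(i=0; i<4; i++) D[i+4*j] *= inv_diag_E[j];
+//		for(jj=0; jj<j; jj++)
+//			for(i=0; i<4; i++) D[i+4*jj] -= E[jj+4*j]*D[i+4*j];
+//		}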
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rut_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rut_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl			$3, %r12d
+	jle				0f
+
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vbroadcastsd	112(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	104(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	96(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+0:
+	cmpl			$2, %r12d
+	jle				1f
+
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vbroadcastsd	72(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	64(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+1:
+	cmpl			$1, %r12d
+	jle				2f
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vbroadcastsd	32(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+2:
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rut_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_run_inv_4x4_lib4, @function
+inner_edge_dtrsm_run_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_run_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_4x4_lib4:
+#endif
+#endif
+
+	// first column
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+
+	// second column
+	vbroadcastsd	32(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+
+	// third column
+	vbroadcastsd	64(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	72(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+
+	// fourth column
+	vbroadcastsd	96(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vbroadcastsd	104(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vbroadcastsd	112(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_run_inv_4x4_lib4, .-inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
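+
+// Note: as a reading aid only, a scalar sketch of the solve above (right, upper,
+// not transposed: D <- D * E^{-1}), processing the columns forward (E stored
+// column-wise, bs=4):
+//
+//	for(j=0; j<4; j++)
+//		{
+//		for(jj=0; jj<j; jj++)
+//			for(i=0; i<4; i++) D[i+4*j] -= E[jj+4*j]*D[i+4*jj];
+//		for(i=0; i<4; i++) D[i+4*j] *= inv_diag_E[j];
+//		}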
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lln_one_4x4_lib4, @function
+inner_edge_dtrsm_lln_one_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lln_one_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_4x4_lib4:
+#endif
+#endif
+
+	vxorpd		%ymm14, %ymm14, %ymm14
+
+	vmovapd		0(%r10), %ymm12
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm0, %ymm0
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm1, %ymm1
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm2, %ymm2
+	vperm2f128	$0x00, %ymm3, %ymm3, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+
+	vmovapd		32(%r10), %ymm12
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm0, %ymm0
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm1, %ymm1
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm2, %ymm2
+	vperm2f128	$0x00, %ymm3, %ymm3, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+
+	vmovapd		64(%r10), %ymm12
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x11, %ymm0, %ymm0, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm0, %ymm0
+	vperm2f128	$0x11, %ymm1, %ymm1, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm1, %ymm1
+	vperm2f128	$0x11, %ymm2, %ymm2, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm2, %ymm2
+	vperm2f128	$0x11, %ymm3, %ymm3, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lln_one_4x4_lib4, .-inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
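+
+// Note: as a reading aid only, a scalar sketch of the solve above (left, lower,
+// not transposed, unit diagonal: D <- E^{-1} * D), i.e. a forward substitution
+// applied to all 4 right-hand-side columns at once:
+//
+//	for(k=0; k<3; k++)
+//		for(j=0; j<4; j++)
+//			for(i=k+1; i<4; i++)
+//				D[i+4*j] -= E[i+4*k]*D[k+4*j];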
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LUN_INV_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lun_inv_4x4_lib4, @function
+inner_edge_dtrsm_lun_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lun_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_4x4_lib4:
+#endif
+#endif
+	
+	vmovapd			96(%r10), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	24(%r11), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovapd			64(%r10), %xmm13
+	vbroadcastsd	16(%r11), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			32(%r10), %xmm13
+	vbroadcastsd	8(%r11), %ymm12
+
+	vpermilpd		$0xf, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vpermilpd		$0xf, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vpermilpd		$0xf, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vpermilpd		$0xf, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vbroadcastsd	0(%r11), %ymm12
+
+	vmulpd			%ymm0, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm0, %ymm0
+
+	vmulpd			%ymm1, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm1, %ymm1
+
+	vmulpd			%ymm2, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm2, %ymm2
+
+	vmulpd			%ymm3, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lun_inv_4x4_lib4, .-inner_edge_dtrsm_lun_inv_4x4_lib4
+#endif
+#endif
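+
+// Note: as a reading aid only, a scalar sketch of the solve above (left, upper,
+// not transposed: D <- E^{-1} * D), i.e. a backward substitution applied to all
+// 4 right-hand-side columns at once:
+//
+//	for(k=3; k>=0; k--)
+//		for(j=0; j<4; j++)
+//			{
+//			D[k+4*j] *= inv_diag_E[k];
+//			for(i=0; i<k; i++)
+//				D[i+4*j] -= E[i+4*k]*D[k+4*j];
+//			}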
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LUN_INV_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lun_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lun_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl	$3, %r12d
+	jle		0f
+
+	vmovapd			96(%r10), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	24(%r11), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+0:
+	cmpl	$2, %r12d
+	jle		1f
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovapd			64(%r10), %xmm13
+	vbroadcastsd	16(%r11), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+1:
+	cmpl	$1, %r12d
+	jle		2f
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			32(%r10), %xmm13
+	vbroadcastsd	8(%r11), %ymm12
+
+	vpermilpd		$0xf, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vpermilpd		$0xf, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vpermilpd		$0xf, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vpermilpd		$0xf, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+2:
+
+	vbroadcastsd	0(%r11), %ymm12
+
+	vmulpd			%ymm0, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm0, %ymm0
+
+	vmulpd			%ymm1, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm1, %ymm1
+
+	vmulpd			%ymm2, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm2, %ymm2
+
+	vmulpd			%ymm3, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lun_inv_4x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+//
+// input arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGETRF_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgetrf_4x4_lib4, @function
+inner_edge_dgetrf_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgetrf_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_4x4_lib4:
+#endif
+#endif
+	
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd	.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd	LC04(%rip), %xmm14 // 1.0
+#endif
+	vmovddup	%xmm14, %xmm14
+
+	// first column
+//	vblendpd	$0x1, %ymm0, %ymm12, %ymm12
+	vmovapd		%ymm0, %ymm12
+	vmovddup	%xmm0, %xmm13
+	vdivpd		%xmm13, %xmm14, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd		%xmm13, 0(%r10)
+	vmulpd		%ymm0, %ymm13, %ymm0
+	vblendpd	$0x1, %ymm12, %ymm0, %ymm0
+
+	// second column
+	vmovddup	%xmm1, %xmm12
+	vperm2f128	$0x00, %ymm12, %ymm12, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm1, %ymm1
+	vblendpd	$0x2, %ymm1, %ymm13, %ymm12
+
+	vpermilpd	$0x3, %xmm1, %xmm13
+	vdivpd		%xmm13, %xmm14, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd		%xmm13, 8(%r10)
+	vmulpd		%ymm1, %ymm13, %ymm1
+	vblendpd	$0x3, %ymm12, %ymm1, %ymm1
+
+	// third column
+	vmovddup	%xmm2, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm2, %ymm2
+	vblendpd	$0x2, %ymm2, %ymm13, %ymm12
+
+	vpermilpd	$0x3, %xmm2, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm2, %ymm2
+	vblendpd	$0x4, %ymm2, %ymm12, %ymm12
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vmovddup	%xmm13, %xmm13
+	vdivpd		%xmm13, %xmm14, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd		%xmm13, 16(%r10)
+	vmulpd		%ymm2, %ymm13, %ymm2
+	vblendpd	$0x7, %ymm12, %ymm2, %ymm2
+
+	// fourth column
+	vmovddup	%xmm3, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+	vblendpd	$0x2, %ymm3, %ymm13, %ymm12
+
+	vpermilpd	$0x3, %xmm3, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+	vblendpd	$0x4, %ymm3, %ymm12, %ymm12
+
+	vperm2f128	$0x11, %ymm3, %ymm3, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+	vblendpd	$0x8, %ymm3, %ymm12, %ymm12
+	
+	vextractf128	$0x1, %ymm3, %xmm13
+	vpermilpd	$0x3, %xmm13, %xmm13
+	vdivpd		%xmm13, %xmm14, %xmm13
+//	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd		%xmm13, 24(%r10)
+//	vmulpd		%ymm3, %ymm13, %ymm3
+	vblendpd	$0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgetrf_4x4_lib4, .-inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
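+
+// Note: as a reading aid only, a scalar sketch of the factorization above
+// (in-place LU without pivoting: unit-lower multipliers below the diagonal,
+// U on and above it, reciprocal pivots written to inv_diag_E):
+//
+//	for(j=0; j<4; j++)
+//		{
+//		for(k=0; k<j; k++)
+//			for(i=k+1; i<4; i++)
+//				D[i+4*j] -= D[i+4*k]*D[k+4*j];
+//		tmp = 1.0/D[j+4*j];
+//		inv_diag_E[j] = tmp;
+//		for(i=j+1; i<4; i++) D[i+4*j] *= tmp;
+//		}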
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+	
+	vmovapd %ymm0,  0(%r10)
+	vmovapd %ymm1, 32(%r10)
+	vmovapd %ymm2, 64(%r10)
+	vmovapd %ymm3, 96(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_vs_lib4, @function
+inner_store_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	cmpl		$2, %r12d
+	vmaskmovpd	%ymm0, %ymm15,  0(%r10)
+	jl			0f // end
+	cmpl		$3, %r12d
+	vmaskmovpd	%ymm1, %ymm15, 32(%r10)
+	jl			0f // end
+	vmaskmovpd	%ymm2, %ymm15, 64(%r10)
+	je			0f // end
+	vmaskmovpd	%ymm3, %ymm15, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_vs_lib4, .-inner_store_4x4_vs_lib4
+#endif
+#endif
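+
+// Note on the masked store above: km is converted to double and compared against
+// per-lane row indices (assuming .LC02 holds values such as {0.5, 1.5, 2.5, 3.5});
+// the sign bit of (row index - km) then drives vmaskmovpd, so only rows i<km are
+// written, while the kn comparisons skip trailing columns. Scalar equivalent
+// (acc stands for the ymm0..ymm3 accumulators):
+//
+//	for(j=0; j<kn && j<4; j++)
+//		for(i=0; i<km && i<4; i++)
+//			D[i+4*j] = acc[i+4*j];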
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular
+//
+// input arguments:
+// r10   <- D
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_4x4_lib4, @function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_lib4:
+#endif
+#endif
+	
+	vmovapd		%ymm0, 0(%r10)
+	vmovapd		32(%r10), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm1, %ymm1	
+	vmovapd		%ymm1, 32(%r10)
+	vmovapd		64(%r10), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm2, %ymm2	
+	vmovapd		%ymm2, 64(%r10)
+	vmovapd		96(%r10), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm3, %ymm3	
+	vmovapd		%ymm3, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs lower triangular
+//
+// input arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_4x4_vs_lib4, @function
+inner_store_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	vmaskmovpd	%ymm0, %ymm15,  0(%r10)
+	cmpl		$2, %r12d
+	jl			0f // end
+	vmovapd		32(%r10), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm1, %ymm1	
+	vmaskmovpd	%ymm1, %ymm15, 32(%r10)
+	cmpl		$3, %r12d
+	jl			0f // end
+	vmovapd		64(%r10), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm2, %ymm2	
+	vmaskmovpd	%ymm2, %ymm15, 64(%r10)
+	je			0f // end
+	vmovapd		96(%r10), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm3, %ymm3	
+	vmaskmovpd	%ymm3, %ymm15, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_4x4_vs_lib4, .-inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_gen_lib4, @function
+inner_store_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_gen_lib4:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2sd	%r13d, %xmm14, %xmm14
+	vcvtsi2sd	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm12
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm12, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm12, %ymm15
+	vandpd		%ymm14, %ymm15, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm3, %ymm2
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm2, %ymm1
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	vmaskmovpd	%ymm0, %ymm15,  0(%r11)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vmaskmovpd	%ymm1, %ymm15, 32(%r11)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vmaskmovpd	%ymm2, %ymm15, 64(%r11)
+	je			3f // end
+	vmaskmovpd	%ymm3, %ymm15, 96(%r11)
+
+	jmp		3f
+
+0:
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+	cmpl	$1, %r10d
+	jg		1f
+
+	// offset==1
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm0, %ymm12, %ymm0
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm1, %ymm12, %ymm1
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm2, %ymm12, %ymm2
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm3, %ymm12, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm15, %ymm12, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC08(%rip), %ymm12
+	vmovupd		.LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC08(%rip), %ymm12
+	vmovupd		LC05(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	cmpl		$2, %r15d
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm0, %ymm13, 0(%rbx)
+	jl			3f // end
+	cmpl		$3, %r15d
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm1, %ymm13, 32(%rbx)
+	jl			3f // end
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm2, %ymm13, 64(%rbx)
+	je			3f // end
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm3, %ymm13, 96(%rbx)
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r10d
+	jg		2f
+
+	// offset==2
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm1
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm2
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC09(%rip), %ymm12
+	vmovupd		.LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC09(%rip), %ymm12
+	vmovupd		LC06(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	cmpl		$2, %r15d
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm0, %ymm13, 0(%rbx)
+	jl			3f // end
+	cmpl		$3, %r15d
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm1, %ymm13, 32(%rbx)
+	jl			3f // end
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm2, %ymm13, 64(%rbx)
+	je			3f // end
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm3, %ymm13, 96(%rbx)
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm1, %ymm1
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm2, %ymm2
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm3, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC10(%rip), %ymm12
+	vmovupd		.LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC10(%rip), %ymm12
+	vmovupd		LC07(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	cmpl		$2, %r15d
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm0, %ymm13, 0(%rbx)
+	jl			3f // end
+	cmpl		$3, %r15d
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm1, %ymm13, 32(%rbx)
+	jl			3f // end
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm2, %ymm13, 64(%rbx)
+	je			3f // end
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm3, %ymm13, 96(%rbx)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_gen_lib4, .-inner_store_4x4_gen_lib4
+#endif
+#endif
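+
+// Note on the generalized store above: rows are masked to the range [m0, m1) and
+// columns to [n0, n1) clamped at 4; with a non-zero offset each column is rotated
+// by offset rows and split across two consecutive row panels, D0 (r11) and
+// D1 = D0 + 4*sdd*sizeof(double), the two masks loaded per offset case (e.g.
+// .LC08 and .LC05 for offset 1) selecting which half of the rotated column is
+// written to which panel.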
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_4X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_4x4_gen_lib4, @function
+inner_store_l_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_gen_lib4:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2sd	%r13d, %xmm14, %xmm14
+	vcvtsi2sd	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm12
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm12, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm12, %ymm15
+	vandpd		%ymm14, %ymm15, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm3, %ymm2
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm2, %ymm1
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm14
+#endif
+
+	vmaskmovpd	%ymm0, %ymm15,  0(%r11)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x1, %ymm14, %ymm15, %ymm15
+	vmaskmovpd	%ymm1, %ymm15, 32(%r11)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x2, %ymm14, %ymm15, %ymm15
+	vmaskmovpd	%ymm2, %ymm15, 64(%r11)
+	je			3f // end
+	vblendpd	$0x4, %ymm14, %ymm15, %ymm15
+	vmaskmovpd	%ymm3, %ymm15, 96(%r11)
+
+	jmp		3f
+
+0:
+	
+	cmpl	$1, %r10d
+	jg		1f
+
+	// offset==1
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm0, %ymm12, %ymm0
+
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm1, %ymm12, %ymm1
+
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm2, %ymm12, %ymm2
+
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm3, %ymm12, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm15, %ymm12, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC08(%rip), %ymm12
+	vmovupd		.LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC08(%rip), %ymm12
+	vmovupd		LC05(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm14
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x2, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x4, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 1)
+	je			3f // end
+	vblendpd	$0x8, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 1)
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r10d
+	jg		2f
+
+	// offset==2
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm0
+
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm1
+
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm2
+
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC09(%rip), %ymm12
+	vmovupd		.LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC09(%rip), %ymm12
+	vmovupd		LC06(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm14
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x4, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x8, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 1)
+	je			3f // end
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 1)
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm0, %ymm0
+
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm1, %ymm1
+
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm2, %ymm2
+
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm3, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC10(%rip), %ymm12
+	vmovupd		.LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC10(%rip), %ymm12
+	vmovupd		LC07(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm14
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x8, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 1)
+	je			3f // end
+	vblendpd	$0x2, %ymm14, %ymm13, %ymm13
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 1)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_4x4_gen_lib4, .-inner_store_l_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+//                               1      2              3          4          5             6          7
+// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x4_lib4
+	.type kernel_dgemm_nt_4x4_lib4, @function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x4_lib4
+	.def kernel_dgemm_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
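+
+// Note: as a reading aid only, the scalar semantics of the kernel above, with A
+// and B stored as 4 x k panels (bs=4, column l of a panel at offset 4*l):
+//
+//	for(j=0; j<4; j++)
+//		for(i=0; i<4; i++)
+//			{
+//			tmp = 0.0;
+//			for(l=0; l<k; l++)
+//				tmp += A[i+4*l] * B[j+4*l];
+//			D[i+4*j] = alpha[0]*tmp + beta[0]*C[i+4*j];
+//			}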
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx        r8            r9         rsp+8     rsp+16   rsp+24
+// void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x4_vs_lib4
+	.type kernel_dgemm_nt_4x4_vs_lib4, @function
+kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x4_vs_lib4
+_kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x4_vs_lib4
+	.def kernel_dgemm_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_vs_lib4, .-kernel_dgemm_nt_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                   1      2              3          4          5             6            7          8        9            10         11       12      13      14      15
+// void kernel_dgemm_nt_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x4_gen_lib4
+	.type kernel_dgemm_nt_4x4_gen_lib4, @function
+kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x4_gen_lib4
+_kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x4_gen_lib4
+	.def kernel_dgemm_nt_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // offsetC
+	movq	ARG7, %r13 // C
+	movq	ARG8, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG9, %r10 // offsetD
+	movq	ARG10, %r11 // D
+	movq	ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG12, %r13 // m0
+	movq	ARG13, %r14 // m1
+	movq	ARG14, %r15 // n0
+	movq	ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_gen_lib4, .-kernel_dgemm_nt_4x4_gen_lib4
+#endif
+
+
+
+
+
+//                               rdi    rsi            rdx        rcx          r8         r9       rsp+8         rsp+16     rsp+24
+// void kernel_dgemm_nn_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_4x4_lib4
+	.type kernel_dgemm_nn_4x4_lib4, @function
+kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_4x4_lib4
+_kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_4x4_lib4
+	.def kernel_dgemm_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_4x4_lib4, .-kernel_dgemm_nn_4x4_lib4
+#endif
+
+
+
+
+
+//                                   rdi    rsi            rdx        rcx       r8         r9       rsp+8         rsp+16    rsp+24     rsp+32    rsp+40   rsp+48     rsp+56   rsp+64  rsp+72  rsp+80  rsp+88
+// void kernel_dgemm_nn_4x4_gen_lib4(int k, double *alpha, double *A, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_4x4_gen_lib4
+	.type kernel_dgemm_nn_4x4_gen_lib4, @function
+kernel_dgemm_nn_4x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_4x4_gen_lib4
+_kernel_dgemm_nn_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_4x4_gen_lib4
+	.def kernel_dgemm_nn_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // offsetC
+	movq	ARG9, %r13 // C
+	movq	ARG10, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG11, %r10 // offsetD
+	movq	ARG12, %r11 // D
+	movq	ARG13, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG14, %r13 // m0
+	movq	ARG15, %r14 // m1
+	movq	ARG16, %r15 // n0
+	movq	ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_4x4_gen_lib4, .-kernel_dgemm_nn_4x4_gen_lib4
+#endif
+
+
+
+
+
+//                               rdi    rsi            rdx        rcx        r8            r9         rsp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_4x4_lib4
+	.type kernel_dsyrk_nt_l_4x4_lib4, @function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_4x4_lib4
+	.def kernel_dsyrk_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call	inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq	_inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx        r8            r9         rsp+8     rsp+16   rsp+24
+// void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
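+// Assumed semantics: same operation as kernel_dsyrk_nt_l_4x4_lib4, but the _vs
+// ("variable size") store writes only km rows and kn columns, for edge blocks
+// smaller than 4x4.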
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_4x4_vs_lib4
+	.type kernel_dsyrk_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_4x4_vs_lib4
+_kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_4x4_vs_lib4
+	.def kernel_dsyrk_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call	inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq	_inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_4x4_vs_lib4, .-kernel_dsyrk_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                     rdi    rsi            rdx        rcx        r8            r9           rsp+8      rsp+16   rsp+24       rsp+32     rsp+40   rsp+48  rsp+56  rsp+64  rsp+72
+// void kernel_dsyrk_nt_l_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_4x4_gen_lib4
+	.type kernel_dsyrk_nt_l_4x4_gen_lib4, @function
+kernel_dsyrk_nt_l_4x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_4x4_gen_lib4
+_kernel_dsyrk_nt_l_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_4x4_gen_lib4
+	.def kernel_dsyrk_nt_l_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // offsetC
+	movq	ARG7, %r13 // C
+	movq	ARG8, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG9, %r10 // offsetD
+	movq	ARG10, %r11 // D
+	movq	ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG12, %r13 // m0
+	movq	ARG13, %r14 // m1
+	movq	ARG14, %r15 // n0
+	movq	ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_4x4_gen_lib4, .-kernel_dsyrk_nt_l_4x4_gen_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx          r8         r9       rsp+8
+// void kernel_dtrmm_nn_rl_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *D);
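+// Assumed semantics (inferred from the triangular edge routine called below):
+// computes the 4x4 block D = alpha*A*B with B lower triangular (right-lower,
+// not transposed), stored panel-major with stride sdb and initial offset offsetB.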
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nn_rl_4x4_lib4
+	.type kernel_dtrmm_nn_rl_4x4_lib4, @function
+kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nn_rl_4x4_lib4
+_kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nn_rl_4x4_lib4
+	.def kernel_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG5, %r12 // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nn_rl_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nn_rl_4x4_lib4, .-kernel_dtrmm_nn_rl_4x4_lib4
+#endif
+
+
+
+
+
+//                                      rdi    rsi            rdx        rcx          r8         r9       rsp+8        rsp+16     rsp+24   rsp+32  rsp+40  rsp+48  rsp+56
+// void kernel_dtrmm_nn_rl_4x4_gen_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nn_rl_4x4_gen_lib4
+	.type kernel_dtrmm_nn_rl_4x4_gen_lib4, @function
+kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nn_rl_4x4_gen_lib4
+_kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nn_rl_4x4_gen_lib4
+	.def kernel_dtrmm_nn_rl_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG5, %r12 // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NN_RL_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // offsetD
+	movq	ARG8, %r11 // D
+	movq	ARG9, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG10, %r13 // m0
+	movq	ARG11, %r14 // m1
+	movq	ARG12, %r15 // n0
+	movq	ARG13, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nn_rl_4x4_gen_lib4, .-kernel_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx        r8            r9         rsp+8
+// void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
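+// Assumed semantics: computes the 4x4 block D = alpha*A*B^T + beta*C with B upper
+// triangular (right-upper, transposed); the first 4 columns are handled by a
+// dedicated triangular edge routine, the remaining k-4 by the plain nt gemm loop.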
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_4x4_lib4
+	.type kernel_dtrmm_nt_ru_4x4_lib4, @function
+kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_4x4_lib4
+_kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_4x4_lib4
+	.def kernel_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // B
+	addq	$128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG3, %r10
+	movq	ARG4, %r11
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_4x4_lib4, .-kernel_dtrmm_nt_ru_4x4_lib4
+#endif
+
+
+
+
+
+//                                     rdi    rsi            rdx        rcx        r8            r9         rsp+8     rsp+16   rsp+24
+// void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+	.type kernel_dtrmm_nt_ru_4x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_4x4_vs_lib4
+_kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+	.def kernel_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // B
+	addq	$128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_4x4_vs_lib4, .-kernel_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                  edi    rsi        rdx        rcx        r8         r9
+// void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
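+// Assumed semantics (inferred from the dpotrf edge routine called below): computes
+// the lower Cholesky factor of the 4x4 block C - A*B^T, stores it in D, and writes
+// the reciprocals of its diagonal entries to inv_diag_D.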
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_4x4_lib4
+	.type kernel_dpotrf_nt_l_4x4_lib4, @function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_4x4_lib4
+	.def kernel_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+//                                     edi    rsi        rdx        rcx        r8         r9                  rsp+8   rsp+16
+// void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_4x4_vs_lib4
+	.type kernel_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_4x4_vs_lib4
+	.def kernel_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movq	ARG8, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG7, %r11 // km 
+	movq	ARG8, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                        edi     rsi         rdx         ecx     r8          r9          rsp+8      rsp+16     rsp+24
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
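+// Assumed semantics: fused syrk + Cholesky on the 4x4 block, i.e. the lower factor
+// of C + Ap*Bp^T - Am*Bm^T is computed and stored in D, with the reciprocals of its
+// diagonal written to inv_diag_D.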
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+//                                           edi     rsi         rdx         ecx     r8          r9          rsp+8      rsp+16     rsp+24             rsp+32   rsp+40
+// void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+	movq	ARG10, %r11 // km 
+	movq	ARG11, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx        r8         r9         rsp+8     
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
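+// Assumed semantics (inferred from the rlt edge routine called below): solves the
+// 4x4 triangular system X*E^T = C - A*B^T with E lower triangular, storing X in D;
+// inv_diag_E supplies the precomputed reciprocals of the diagonal of E.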
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+	.type kernel_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+	.def kernel_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                            edi     rsi         rdx         ecx     r8          r9          rsp+8      rsp+16     rsp+24     rsp+32
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
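+// Assumed semantics: fused gemm + trsm, i.e. solves X*E^T = C + Ap*Bp^T - Am*Bm^T
+// with E lower triangular (reciprocal diagonal in inv_diag_E) and stores X in D.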
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10   // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx        r8         r9         rsp+8               rsp+16  rsp+24  
+// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.type kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.def kernel_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn // TODO scale gen
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                               edi     rsi         rdx         ecx     r8          r9          rsp+8    rsp+16     rsp+24     rsp+32                rsp+40 rsp+48
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10  // C 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D 
+	movq	ARG11, %r11 // km 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx        r8         r9
+// void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E);
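+// Assumed semantics: as kernel_dtrsm_nt_rl_inv_4x4_lib4, but E is taken to have a
+// unit diagonal ("one"), so no inv_diag_E argument is needed.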
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_one_4x4_lib4
+	.type kernel_dtrsm_nt_rl_one_4x4_lib4, @function
+kernel_dtrsm_nt_rl_one_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_one_4x4_lib4
+_kernel_dtrsm_nt_rl_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_one_4x4_lib4
+	.def kernel_dtrsm_nt_rl_one_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_one_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_one_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_one_4x4_lib4, .-kernel_dtrsm_nt_rl_one_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx        r8         r9         rsp+8   rsp+16
+// void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+	.type kernel_dtrsm_nt_rl_one_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+	.def kernel_dtrsm_nt_rl_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG8, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG7, %r11 // km 
+	movq	ARG8, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_one_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx        r8         r9         rsp+8
+// void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
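+// Assumed semantics (inferred from the rut edge routine called below): solves
+// X*E^T = C - A*B^T with E upper triangular, reciprocal diagonal in inv_diag_E,
+// and stores X in D.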
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_ru_inv_4x4_lib4
+	.type kernel_dtrsm_nt_ru_inv_4x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_ru_inv_4x4_lib4
+_kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_ru_inv_4x4_lib4
+	.def kernel_dtrsm_nt_ru_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11 // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rut_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rut_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_ru_inv_4x4_lib4, .-kernel_dtrsm_nt_ru_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx        r8         r9         rsp+8                rsp+16  rsp+24
+// void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+	.type kernel_dtrsm_nt_ru_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+	.def kernel_dtrsm_nt_ru_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11 // inv_diag_E 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_ru_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx      r8         r9         rsp+8      rsp+16
+// void kernel_dtrsm_nn_ru_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
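+// Assumed semantics (inferred from the run edge routine called below): solves
+// X*E = C - A*B with E upper triangular (right side, not transposed); B is read
+// panel-major with stride sdb, and the reciprocal diagonal of E is in inv_diag_E.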
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ru_inv_4x4_lib4
+	.type kernel_dtrsm_nn_ru_inv_4x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ru_inv_4x4_lib4
+_kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ru_inv_4x4_lib4
+	.def kernel_dtrsm_nn_ru_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+	movq	ARG8, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_run_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ru_inv_4x4_lib4, .-kernel_dtrsm_nn_ru_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx      r8         r9         rsp+8      rsp+16              rsp+24  rsp+32
+// void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+	.type kernel_dtrsm_nn_ru_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+	.def kernel_dtrsm_nn_ru_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+	movq	ARG8, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_run_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+	movq	ARG9, %r11  // km 
+	movq	ARG10, %r12  // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ru_inv_4x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx      r8         r9         rsp+8
+// void kernel_dtrsm_nn_ll_one_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E);
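+// Assumed semantics (inferred from the lln edge routine called below): solves
+// E*X = C - A*B with E lower triangular and unit diagonal (left side, not
+// transposed), storing X in D.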
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ll_one_4x4_lib4
+	.type kernel_dtrsm_nn_ll_one_4x4_lib4, @function
+kernel_dtrsm_nn_ll_one_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ll_one_4x4_lib4
+_kernel_dtrsm_nn_ll_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ll_one_4x4_lib4
+	.def kernel_dtrsm_nn_ll_one_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lln_one_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ll_one_4x4_lib4, .-kernel_dtrsm_nn_ll_one_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx      r8         r9         rsp+8      rsp+16  rsp+24
+// void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+	.type kernel_dtrsm_nn_ll_one_4x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+	.def kernel_dtrsm_nn_ll_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lln_one_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+	movq	ARG8, %r11  // km 
+	movq	ARG9, %r12  // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ll_one_4x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx      r8         r9         rsp+8      rsp+16
+// void kernel_dtrsm_nn_lu_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
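+// Assumed semantics (inferred from the lun edge routine called below): solves
+// E*X = C - A*B with E upper triangular (left side, not transposed), reciprocal
+// diagonal in inv_diag_E, storing X in D.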
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_lu_inv_4x4_lib4
+	.type kernel_dtrsm_nn_lu_inv_4x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_lu_inv_4x4_lib4
+_kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_lu_inv_4x4_lib4
+	.def kernel_dtrsm_nn_lu_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+	movq	ARG8, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lun_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lun_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_lu_inv_4x4_lib4, .-kernel_dtrsm_nn_lu_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx      r8         r9         rsp+8      rsp+16              rsp+24  rsp+32
+// void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+	.type kernel_dtrsm_nn_lu_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+	.def kernel_dtrsm_nn_lu_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+	movq	ARG8, %r11  // inv_diag_E 
+	movq	ARG9, %r12  // km 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LUN_INV_4X4_VS_LIB4 // TODO
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lun_inv_4x4_vs_lib4 // TODO
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lun_inv_4x4_vs_lib4 // TODO
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+	movq	ARG9, %r11  // km 
+	movq	ARG10, %r12  // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_lu_inv_4x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                edi    rsi        rdx        rcx      r8         r9         rsp+8
+// void kernel_dgetrf_nn_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D);
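+//
+// as the call sequence below suggests, this kernel factorizes the 4x4 block
+// C - A*B without pivoting: the dgemm_sub_nn call accumulates -A*B, the
+// scale_11 call adds C, the dgetrf edge routine factorizes the result in
+// registers (writing the reciprocals of the U diagonal to inv_diag_D), and
+// the store routine writes the packed L\U factors to D.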
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_4x4_lib4
+	.type kernel_dgetrf_nn_4x4_lib4, @function
+kernel_dgetrf_nn_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_4x4_lib4
+_kernel_dgetrf_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_4x4_lib4
+	.def kernel_dgetrf_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG7, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_4x4_lib4, .-kernel_dgetrf_nn_4x4_lib4
+#endif
+
+
+
+
+
+//                                   edi    rsi        rdx        rcx      r8         r9         rsp+8               rsp+16  rsp+24
+// void kernel_dgetrf_nn_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_4x4_vs_lib4
+	.type kernel_dgetrf_nn_4x4_vs_lib4, @function
+kernel_dgetrf_nn_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_4x4_vs_lib4
+_kernel_dgetrf_nn_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_4x4_vs_lib4
+	.def kernel_dgetrf_nn_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG7, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+	movq	ARG8, %r11  // km 
+	movq	ARG9, %r12  // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_4x4_vs_lib4, .-kernel_dgetrf_nn_4x4_vs_lib4
+#endif
+
+
+
+
+
+#if 0
+//                                   rdi    rsi            rdx        rcx        r8            r9         rsp+8
+// void kernel_dlauum_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dlauum_nt_4x4_lib4
+	.type kernel_dlauum_nt_4x4_lib4, @function
+kernel_dlauum_nt_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dlauum_nt_4x4_lib4
+_kernel_dlauum_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dlauum_nt_4x4_lib4
+	.def kernel_dlauum_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dlauum_nt_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // B
+	addq	$128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DLAUUM_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dlauum_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dlauum_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner loader nn
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlauum_nt_4x4_lib4, .-kernel_dlauum_nt_4x4_lib4
+#endif
+
+
+
+
+
+//                                   rdi    rsi            rdx        rcx        r8            r9         rsp+8      rsp+16  rsp+24
+// void kernel_dlauum_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dlauum_nt_4x4_vs_lib4
+	.type kernel_dlauum_nt_4x4_vs_lib4, @function
+kernel_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dlauum_nt_4x4_vs_lib4
+_kernel_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dlauum_nt_4x4_vs_lib4
+	.def kernel_dlauum_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dlauum_nt_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // B
+	addq	$128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DLAUUM_NT_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dlauum_nt_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+	// call inner loader nn
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlauum_nt_4x4_vs_lib4, .-kernel_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+//                             1         2           3           4
+// void kernel_dlarfb4_r_4_lib4(int kmax, double *pV, double *pT, double *pD);
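+//
+// applies a block of 4 Householder reflectors to 4 rows of pD from the right
+// (compact WY form: pV holds the reflector vectors, pT the 4x4 triangular
+// factor); reading the code below, it accumulates a 4x4 block W = pD * pV'
+// in ymm0-ymm3, multiplies W by pT, and applies the resulting rank-4 update
+// back onto pD, with the columns beyond the first 4x4 block handled by the
+// dgebp kernel.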
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dlarfb4_r_4_lib4
+	.type kernel_dlarfb4_r_4_lib4, @function
+kernel_dlarfb4_r_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dlarfb4_r_4_lib4
+_kernel_dlarfb4_r_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dlarfb4_r_4_lib4
+	.def kernel_dlarfb4_r_4_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // D
+	movq	ARG2, %r12 // V
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // D
+	movq	ARG2, %r12 // V
+
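+	// the leading 4x4 block of pV is unit triangular in its stored form, so the
+	// first four columns of pD are folded into W explicitly: each column is
+	// added once (unit diagonal) and scaled by the stored off-diagonal entries
+	// of pV at offsets 32, 64, 72, 96, 104 and 112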
+	//
+	vmovapd			0(%r11), %ymm12
+	vaddpd			%ymm12, %ymm0, %ymm0
+	//
+	vmovapd			32(%r11), %ymm12
+	vaddpd			%ymm12, %ymm1, %ymm1
+	vbroadcastsd	32(%r12), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	//
+	vmovapd			64(%r11), %ymm12
+	vaddpd			%ymm12, %ymm2, %ymm2
+	vbroadcastsd	64(%r12), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	72(%r12), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	//
+	vmovapd			96(%r11), %ymm12
+	vaddpd			%ymm12, %ymm3, %ymm3
+	vbroadcastsd	96(%r12), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	104(%r12), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	112(%r12), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+
+	movq	ARG3, %r10 // T
+
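+	// multiply W (held as columns in ymm0-ymm3) by the upper triangular 4x4
+	// factor T, processing the columns from last to first so each register can
+	// be updated in place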
+	//
+	vbroadcastsd	120(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	//
+	vbroadcastsd	112(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vbroadcastsd	80(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	//
+	vbroadcastsd	104(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vbroadcastsd	72(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	40(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	//
+	vbroadcastsd	96(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vbroadcastsd	64(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vbroadcastsd	32(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vbroadcastsd	0(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // V
+	movq	ARG4, %r12 // D
+
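+	// fold W back into the first four columns of pD through the unit triangular
+	// head of pV; the remaining kmax-4 columns are updated by the dgebp call
+	// below, which reuses W kept in ymm0-ymm3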
+	//
+	vmovapd			0(%r12), %ymm12
+	vaddpd			%ymm12, %ymm0, %ymm12
+	vmovapd			%ymm12, 0(%r12)
+	//
+	vmovapd			32(%r12), %ymm12
+	vbroadcastsd	32(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vaddpd			%ymm12, %ymm1, %ymm12
+	vmovapd			%ymm12, 32(%r12)
+	//
+	vmovapd			64(%r12), %ymm12
+	vbroadcastsd	64(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	72(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vaddpd			%ymm12, %ymm2, %ymm12
+	vmovapd			%ymm12, 64(%r12)
+	//
+	vmovapd			96(%r12), %ymm12
+	vbroadcastsd	96(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	104(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vbroadcastsd	112(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vaddpd			%ymm12, %ymm3, %ymm12
+	vmovapd			%ymm12, 96(%r12)
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEBP_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgebp_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgebp_add_nn_4x4_lib4
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlarfb4_r_4_lib4, .-kernel_dlarfb4_r_4_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { -1 -1 -1 1 }
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { -1 -1 -1 -1 }
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 3.5 2.5 1.5 0.5 }
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
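+	// each pair of .long values above is the little-endian low/high word of an
+	// IEEE-754 double: {0, 1071644672} = 0x3FE0000000000000 = 0.5, so .LC02
+	// stores 0.5, 1.5, 2.5, 3.5 at increasing offsets; e.g. in C:
+	//   uint64_t u = (uint64_t)1071644672 << 32; double x; memcpy(&x, &u, 8); /* x == 0.5 */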
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { 7.5 6.5 5.5 4.5 }
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC04: // { 1.0 1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemm_8x4_lib4.S b/kernel/avx/kernel_dgemm_8x4_lib4.S
new file mode 100644
index 0000000..e9f1f34
--- /dev/null
+++ b/kernel/avx/kernel_dgemm_8x4_lib4.S
@@ -0,0 +1,13154 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
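+
+// the PROLOGUE/EPILOGUE macros above save and restore the registers that are
+// callee-saved under the respective calling convention (rbx, rbp and r12-r15
+// on the System V ABI; additionally rdi, rsi and xmm6-xmm15 on Windows x64),
+// and vzeroupper avoids SSE/AVX transition penalties around call boundaries.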
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d02 d13 d20 d31]
+// ymm3  <- [d03 d12 d21 d30]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d42 d53 d60 d71]
+// ymm7  <- [d43 d52 d61 d70]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+4*k*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d02 d13 d20 d31]
+// ymm3  <- [d03 d12 d21 d30]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d42 d53 d60 d71]
+// ymm7  <- [d43 d52 d61 d70]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
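+//
+// note on the layout above: with b = B[0..3] loaded as [b0 b1 b2 b3], the
+// kernel multiplies the A columns by b, by the in-lane swap [b1 b0 b3 b2]
+// (vshufpd $0x5), by the lane swap [b3 b2 b1 b0] (vperm2f128 $0x1) and by the
+// combined swap [b2 b3 b0 b1], which yields the rotated-diagonal accumulators
+// listed above; a later blend step un-rotates them into plain columns.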
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nt_8x4_lib4, @function
+inner_kernel_dgemm_add_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nt_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_8x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+//	movq	%r11, %r15 // A1 <- A0
+//	addq	%r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+	// prefetch
+	vmovapd 0(%r11), %ymm8 // A0[0]
+//	vmovapd 0(%r15), %ymm9 // A1[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmovapd 0(%r13), %ymm12 // B[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmovapd 32(%r13), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+//	vmovapd 32(%r15), %ymm11 // A1[4]
+	vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	// unroll 1
+	vmovapd 64(%r13), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+//	vmovapd 64(%r15), %ymm9 // A1[8]
+	vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	// unroll 2
+	vmovapd 96(%r13), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+//	vmovapd 96(%r15), %ymm11 // A1[12]
+	vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r13
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	addq	$128, %r11
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+//	addq	$128, %r15 // not needed: A1 is addressed via 0(%r11, %r12, 1)
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+
+	// unroll 3
+	vmovapd 0(%r13), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+//	vmovapd 0(%r15), %ymm9 // A1[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vmovapd 32(%r13), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+//	vmovapd 32(%r15), %ymm11 // A1[4]
+	vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	// unroll 1
+	vmovapd 64(%r13), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+//	vmovapd 64(%r15), %ymm9 // A1[8]
+	vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	// unroll 2
+	vmovapd 96(%r13), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+//	vmovapd 96(%r15), %ymm11 // A1[12]
+	vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r13
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+	addq	$128, %r11
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+//	addq	$128, %r15
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+
+	// unroll 3
+//	vmovapd 0(%r13), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+//	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+//	vmovapd 0(%r15), %ymm9 // A1[0]
+//	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+
+//	cmpl	$3, %r10d
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd 0(%r13), %ymm12 // B[0]
+	vmovapd 0(%r11), %ymm8 // A0[0]
+//	vmovapd 0(%r15), %ymm9 // A1[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	addq	$32, %r11
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	addq	$32, %r13
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+//	addq	$32, %r15
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	vshufpd $0x5, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	subl	$1, %r10d
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nt_8x4_lib4, .-inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d02 d13 d20 d31]
+// ymm3  <- [d03 d12 d21 d30]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d42 d53 d60 d71]
+// ymm7  <- [d43 d52 d61 d70]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+4*k*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d02 d13 d20 d31]
+// ymm3  <- [d03 d12 d21 d30]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d42 d53 d60 d71]
+// ymm7  <- [d43 d52 d61 d70]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
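+//
+// same register layout and permutation scheme as
+// inner_kernel_dgemm_add_nt_8x4_lib4 above, with vaddpd replaced by vsubpd so
+// that the product A*B' is subtracted from the accumulators.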
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nt_8x4_lib4, @function
+inner_kernel_dgemm_sub_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nt_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_8x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmovapd 0(%r13), %ymm12 // B[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmovapd 32(%r13), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+
+	vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+	// unroll 1
+	vmovapd 64(%r13), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+
+	vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+	// unroll 2
+	vmovapd 96(%r13), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+
+	vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r13
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+	addq	$128, %r11
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+
+	// unroll 3
+	vmovapd 0(%r13), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+	cmpl	$4, %r10d
+
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vmovapd 32(%r13), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+
+	vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+	// unroll 1
+	vmovapd 64(%r13), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+
+	vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+	// unroll 2
+	vmovapd 96(%r13), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+
+	vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r13
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+	addq	$128, %r11
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+
+	// unroll 3
+//	vmovapd 0(%r13), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+//	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+//	cmpl	$3, %r10d
+
+//	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vmulpd	%ymm11, %ymm13, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	vmulpd	%ymm11, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd 0(%r13), %ymm12 // B[0]
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm0, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm9, %ymm12, %ymm15
+	vsubpd	%ymm15, %ymm4, %ymm4
+	addq	$32, %r11
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm1, %ymm1
+	addq	$32, %r13
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm5, %ymm5
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm3, %ymm3
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm7, %ymm7
+
+	vshufpd $0x5, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm2, %ymm2
+	subl	$1, %r10d
+	vmulpd	%ymm9, %ymm14, %ymm15
+	vsubpd	%ymm15, %ymm6, %ymm6
+
+	cmpl	$0, %r10d
+
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nt_8x4_lib4, .-inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- k
+// r11   <- A+4*sda*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+(k/4)*4*sdb*sizeof(double)+(k%4)*sizeof(double)
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
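+//
+// unlike the nt kernels above, the nn kernel reads B one scalar at a time
+// with vbroadcastsd, so the accumulators hold plain columns and no blend step
+// is needed; B advances by one panel (4*sdb*sizeof(double), kept in r14)
+// every four iterations, and the next panel is software-prefetched.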
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nn_8x4_lib4, @function
+inner_kernel_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_8x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	0(%r13, %r14, 2) // software prefetch
+	prefetcht0	64(%r13, %r14, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vmovapd			32(%r11), %ymm10 // A0
+
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vmovapd			64(%r11), %ymm8 // A0
+
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	72(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	104(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	// unroll 2
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vmovapd			96(%r11), %ymm10 // A0
+
+	vbroadcastsd	48(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	80(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	112(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+	addq	$128, %r11
+
+	// unroll 3
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vmovapd			0(%r11), %ymm8 // A0
+
+	vbroadcastsd	56(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vmovapd			0(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	88(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	120(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	addq	%r14, %r13
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vmovapd			32(%r11), %ymm10 // A0
+
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vmovapd			64(%r11), %ymm8 // A0
+
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	72(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	104(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	// unroll 2
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vmovapd			96(%r11), %ymm10 // A0
+
+	vbroadcastsd	48(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	80(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	112(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+	addq	$128, %r11
+
+	// unroll 3
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+//	vmovapd			0(%r11), %ymm8 // A0
+
+	vbroadcastsd	56(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+//	vmovapd			0(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	88(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	120(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	addq	%r14, %r13
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd			0(%r11), %ymm8 // A0[0]
+	vmovapd 		0(%r11, %r12, 1), %ymm9 // A1[0]
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	addq	$32, %r11
+
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	subl	$1, %r10d
+
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+	addq	$8, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nn_8x4_lib4, .-inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- k
+// r11   <- A+4*sda*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+(k/4)*4*sdb*sizeof(double)+(k%4)*sizeof(double)
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
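+//
+// same broadcast scheme as inner_kernel_dgemm_add_nn_8x4_lib4 above, with the
+// products subtracted from the accumulators.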
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nn_8x4_lib4, @function
+inner_kernel_dgemm_sub_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_8x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	0(%r13, %r14, 2) // software prefetch
+	prefetcht0	64(%r13, %r14, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmovapd			32(%r11), %ymm10 // A0
+
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmovapd			64(%r11), %ymm8 // A0
+
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	72(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	104(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+
+	// unroll 2
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmovapd			96(%r11), %ymm10 // A0
+
+	vbroadcastsd	48(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	80(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	112(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	addq	$128, %r11
+
+	// unroll 3
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmovapd			0(%r11), %ymm8 // A0
+
+	vbroadcastsd	56(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmovapd			0(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	88(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	120(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	addq	%r14, %r13
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmovapd			32(%r11), %ymm10 // A0
+
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmovapd			64(%r11), %ymm8 // A0
+
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	72(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	104(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+
+	// unroll 2
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmovapd			96(%r11), %ymm10 // A0
+
+	vbroadcastsd	48(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	80(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	112(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	addq	$128, %r11
+
+	// unroll 3
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+//	vmovapd			0(%r11), %ymm8 // A0
+
+	vbroadcastsd	56(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+//	vmovapd			0(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	88(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+
+	vbroadcastsd	120(%r13), %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	addq	%r14, %r13
+	vmulpd			%ymm11, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd			0(%r11), %ymm8 // A0[0]
+	vmovapd 		0(%r11, %r12, 1), %ymm9 // A1[0]
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	addq	$32, %r11
+
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	subl	$1, %r10d
+
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	addq	$8, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nn_8x4_lib4, .-inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NN_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nn_4x8_lib4, @function
+inner_kernel_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nn_4x8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x8_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 		0(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
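+	// 4-way unrolled: each pass consumes 4 columns of A and 4 rows of B; the 8 entries of a B row
+	// sit 32 bytes apart within the row panel, and the 4x8 result accumulates in ymm0-ymm7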
+
+	prefetcht0	0(%r12, %r13, 2) // software prefetch
+	prefetcht0	64(%r12, %r13, 2) // software prefetch
+	prefetcht0	128(%r12, %r13, 2) // software prefetch
+	prefetcht0	192(%r12, %r13, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	128(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	160(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	192(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	224(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	136(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	168(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	200(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	232(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	144(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	176(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	208(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	240(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+
+	// unroll 0
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	152(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	184(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	216(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	248(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+	addq	%r13, %r12
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	128(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	160(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	192(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	224(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	136(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	168(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	200(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	232(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	144(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	176(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	208(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	240(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+
+	// unroll 0
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	152(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	184(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	216(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	248(%r12), %ymm12 // B
+	vmulpd			%ymm14, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+	addq	%r13, %r12
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	128(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	160(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	192(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	224(%r12), %ymm12 // B
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+
+	addq	$32, %r11
+	addq	$8, %r12
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nn_4x8_lib4, .-inner_kernel_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- B
+// r12   <- C
+// r13   <- 32*sdc
+// ymm0  <- [a00 a10 a20 a30]
+// ymm1  <- [a01 a11 a21 a31]
+// ymm2  <- [a02 a12 a22 a32]
+// ymm3  <- [a03 a13 a23 a33]
+// ymm4  <- [a40 a50 a60 a70]
+// ymm5  <- [a41 a51 a61 a71]
+// ymm6  <- [a42 a52 a62 a72]
+// ymm7  <- [a43 a53 a63 a73]
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- ?
+// r12   <- ?
+// r13   <- 32*sdc
+// ymm0  <- [a00 a10 a20 a30]
+// ymm1  <- [a01 a11 a21 a31]
+// ymm2  <- [a02 a12 a22 a32]
+// ymm3  <- [a03 a13 a23 a33]
+// ymm4  <- [a40 a50 a60 a70]
+// ymm5  <- [a41 a51 a61 a71]
+// ymm6  <- [a42 a52 a62 a72]
+// ymm7  <- [a43 a53 a63 a73]
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEBP_ADD_NN_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgebp_add_nn_8x4_lib4, @function
+inner_kernel_dgebp_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgebp_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_8x4_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
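+	// each pass updates 4 columns of C in place: C(0:7,j) += A(0:7,0:3) * B(0:3,j),
+	// with A held in ymm0-ymm7 and the two C row panels addressed via r12 and r12+r13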
+	vmovapd			0(%r12), %ymm12
+	vmovapd			0(%r12, %r13, 1), %ymm14
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	8(%r11), %ymm13
+	subl	$4, %r10d
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm6, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm7, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vmovapd			%ymm12, 0(%r12)
+	vmovapd			%ymm14, 0(%r12, %r13, 1)
+
+	vmovapd			32(%r12), %ymm12
+	vmovapd			32(%r12, %r13, 1), %ymm14
+	vbroadcastsd	32(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	40(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	48(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm6, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	56(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm7, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vmovapd			%ymm12, 32(%r12)
+	vmovapd			%ymm14, 32(%r12, %r13, 1)
+
+	vmovapd			64(%r12), %ymm12
+	vmovapd			64(%r12, %r13, 1), %ymm14
+	vbroadcastsd	64(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	72(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	80(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm6, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	88(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm7, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vmovapd			%ymm12, 64(%r12)
+	vmovapd			%ymm14, 64(%r12, %r13, 1)
+
+	vmovapd			96(%r12), %ymm12
+	vmovapd			96(%r12, %r13, 1), %ymm14
+	vbroadcastsd	96(%r11), %ymm13
+	addq	$128, %r11
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	-24(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	-16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm6, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	-8(%r11), %ymm13
+	addq	$128, %r12
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm7, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vmovapd			%ymm12, -32(%r12)
+	vmovapd			%ymm14, -32(%r12, %r13, 1)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r12), %ymm12
+	vmovapd			0(%r12, %r13, 1), %ymm14
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm6, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm7, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vmovapd			%ymm12, 0(%r12)
+	vmovapd			%ymm14, 0(%r12, %r13, 1)
+
+	addq	$32, %r11
+	addq	$32, %r12
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // main loop
+
+	// return
+0:
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgebp_add_nn_8x4_lib4, .-inner_kernel_dgebp_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B-offB+bs*sdb*sizeof(double)
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgemm_add_nn_8x4_lib4, @function
+inner_edge_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgemm_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_8x4_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r15d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$4, %ebx
+	subl			%r15d, %ebx // 4-offsetB
+	cmpl			%r10d, %ebx
+//	jle				0f
+//	movl			%r10d, %ebx // kend=min(k,4-offsetB)
+//0:
+	cmovgl			%r10d, %ebx // kend=min(k,4-offsetB)
+
+	movl			%r15d, %eax
+	sall			$3, %eax // offsetB*sizeof(double)
+	addq			%rax, %r13 // B+offsetB*sizeof(double)
+
+1:
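+	// one column of A0/A1 and one row of B per iteration, for kend=min(k,4-offB) iterations
+	// until B reaches the next panel boundary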
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	subl			$1, %ebx // kend-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %ebx
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r14, %r13
+	subq			$32, %r13 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgemm_add_nn_8x4_lib4, .-inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGEMM_ADD_NN_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgemm_add_nn_4x8_lib4, @function
+inner_edge_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgemm_add_nn_4x8_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x8_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r14d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$4, %r15d
+	subl			%r14d, %r15d // 4-offsetB
+	cmpl			%r10d, %r15d
+//	jle				0f
+//	movl			%r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+	cmovgl			%r10d, %r15d // kend=min(k,4-offsetB)
+
+	movl			%r14d, %eax
+	sall			$3, %eax // offsetB*sizeof(double)
+	addq			%rax, %r12 // B+offsetB*sizeof(double)
+
+1:
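+	// one column of A and one row of B (8 entries) per iteration, until B reaches the next panel boundary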
+	vmovapd			0(%r11), %ymm12
+	vbroadcastsd	0(%r12), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vbroadcastsd	32(%r12), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vbroadcastsd	64(%r12), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vbroadcastsd	96(%r12), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vbroadcastsd	128(%r12), %ymm13 // B
+	vmulpd			%ymm12, %ymm13, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	160(%r12), %ymm13 // B
+	vmulpd			%ymm12, %ymm13, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	192(%r12), %ymm13 // B
+	vmulpd			%ymm12, %ymm13, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	224(%r12), %ymm13 // B
+	vmulpd			%ymm12, %ymm13, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+
+	subl			$1, %r10d // k-1
+	subl			$1, %r15d // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r15d
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r13, %r12
+	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgemm_add_nn_4x8_lib4, .-inner_edge_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10   <- A
+// r11   <- 4*sda*sizeof(double)
+// r12   <- B
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- A+4*4*sizeof(double)
+// r11   <- 4*sda*sizeof(double)
+// r12   <- B+4*4*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_8x4_lib4, @function
+inner_edge_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_8x4_lib4:
+#endif
+#endif
+	
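+	// B is upper triangular: A column l (l=0..3) contributes only to result columns 0..l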
+	vbroadcastsd	0(%r12), %ymm12
+	vmovapd			0(%r10), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			0(%r10, %r11, 1), %ymm9
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+
+	vbroadcastsd	32(%r12), %ymm12
+	vmovapd			32(%r10), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			32(%r10, %r11, 1), %ymm9
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	40(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+
+	vbroadcastsd	64(%r12), %ymm12
+	vmovapd			64(%r10), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			64(%r10, %r11, 1), %ymm9
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	72(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	80(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+
+	vbroadcastsd	96(%r12), %ymm12
+	vmovapd			96(%r10), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			96(%r10, %r11, 1), %ymm9
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	104(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	112(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	120(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+
+	addq			$128, %r10
+	addq			$128, %r12
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_8x4_lib4, .-inner_edge_dtrmm_nt_ru_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- max(k-4,0)
+// r11   <- A+4*4*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+4*4*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_8x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#endif
+#endif
+	
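+	// variable-size edge: k is decremented and checked after every column, so the triangle stops early when k<4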
+	vbroadcastsd	0(%r13), %ymm12
+	subl			$1, %r10d
+	vmovapd			0(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	addq			$32, %r11
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	addq			$32, %r13
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vbroadcastsd	0(%r13), %ymm12
+	subl			$1, %r10d
+	vmovapd			0(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	addq			$32, %r11
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	addq			$32, %r13
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vbroadcastsd	0(%r13), %ymm12
+	subl			$1, %r10d
+	vmovapd			0(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	addq			$32, %r11
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	addq			$32, %r13
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vbroadcastsd	0(%r13), %ymm12
+	subl			$1, %r10d
+	vmovapd			0(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm4, %ymm15, %ymm4
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm5, %ymm15, %ymm5
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	addq			$32, %r11
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm6, %ymm15, %ymm6
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	addq			$32, %r13
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_8x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A0
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B-offB+bs*sdb*sizeof(double)
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NN_RL_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nn_rl_8x4_lib4, @function
+inner_edge_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nn_rl_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_8x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r15d
+	jg		0f
+
+	// offB==0
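+	// B is panel-aligned: process the 4x4 lower-triangular block, where B row l contributes only to columns 0..l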
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r12, 1), %ymm9
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r12, 1), %ymm9
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	48(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	80(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r12, 1), %ymm9
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	56(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	88(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	120(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A0+4*bs*sizeof(double)
+	addq			%r14, %r13 // B+bs*sdb*sizeof(double)
+
+	jmp		3f
+
+0:
+	cmpl	$1, %r15d
+	jg		1f
+
+	// offB==1
+
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r12, 1), %ymm9
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r12, 1), %ymm9
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	48(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	80(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	subl			$3, %r10d // k-3
+	addq			$96, %r11 // A0+3*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$8, %r13 // B+bs*sdb*sizeof(double)-1
+
+	jmp		3f
+
+1:
+	cmpl	$2, %r15d
+	jg		2f
+
+	// offB==2
+
+	addq			$16, %r13 // B+2*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r12, 1), %ymm9
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+
+	subl			$2, %r10d // k-2
+	addq			$64, %r11 // A0+2*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$16, %r13 // B+bs*sdb*sizeof(double)-2
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r12, 1), %ymm9
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	72(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	104(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r12, 1), %ymm9
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	48(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	80(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	112(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r12, 1), %ymm9
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	56(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	88(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	120(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A0+4*bs*sizeof(double)
+	addq			%r14, %r13 // B+bs*sdb*sizeof(double)
+
+	jmp		3f
+
+2:
+	// offB==3
+
+	addq			$24, %r13 // B+3*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-3
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r12, 1), %ymm9
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	40(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	72(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r12, 1), %ymm9
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	48(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	80(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	112(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r12, 1), %ymm9
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	56(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	88(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	120(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A0+4*bs*sizeof(double)
+	addq			%r14, %r13 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nn_rl_8x4_lib4, .-inner_edge_dtrmm_nn_rl_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A0
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B-offB+bs*sdb*sizeof(double)
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NN_RL_8X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nn_rl_8x4_gen_lib4, @function
+inner_edge_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nn_rl_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_8x4_gen_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	cmpl			$0, %r15d
+	jg				0f // offB>0
+
+	// offB==0
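+	// same triangular edge as the non-gen version, but k is checked after every row so short blocks exit early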
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f // end
+
+0:
+	cmpl			$1, %r15d
+	jg				1f // offB>1
+
+	// offB==1
+
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f // end
+
+1:
+	cmpl			$2, %r15d
+	jg				2f // offB>2
+
+	// offB==2
+
+	addq			$16, %r13 // B+2*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f
+
+2:
+	// offB==3
+
+	addq			$24, %r13 // B+3*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nn_rl_8x4_gen_lib4, .-inner_edge_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
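+//
+// informally: on entry each register holds one "rotated diagonal" of a 4x4
+// block of the result (e.g. ymm0 = {d00,d11,d22,d33}, ymm1 = {d01,d10,d23,d32});
+// the two vblendpd stages regroup them so that ymm0..ymm3 (ymm4..ymm7) hold
+// plain columns 0..3 of the upper (lower) four rows, i.e. lane i of ymm_j
+// ends up as d(i,j) resp. d(4+i,j).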
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_8x4_lib4, @function
+inner_blend_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_8x4_lib4:
+#endif
+#endif
+	
+
+	// tc==n
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vblendpd	$0xa, %ymm5, %ymm4, %ymm8
+	vblendpd	$0x5, %ymm5, %ymm4, %ymm9
+	vblendpd	$0xa, %ymm7, %ymm6, %ymm10
+	vblendpd	$0x5, %ymm7, %ymm6, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm4
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm6
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm5
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_8x4_lib4, .-inner_blend_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
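+//
+// rough C sketch of the update (illustrative; acc denotes the 8x4 block held
+// in ymm0..ymm7, C the two 4-row panels, the second 4*sdc doubles further):
+//   for(j=0; j<4; j++) for(i=0; i<4; i++) {
+//       acc[i  ][j] += C[i + 4*j];          // panel at r10
+//       acc[4+i][j] += C[i + 4*j + 4*sdc];  // panel at r10 + r11
+//   }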
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_8x4_lib4, @function
+inner_scale_11_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_8x4_lib4:
+#endif
+#endif
+	
+
+	vmovapd		0(%r10), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+	vmovapd		0(%r10, %r11, 1), %ymm15
+	vaddpd		%ymm4, %ymm15, %ymm4
+	vmovapd		32(%r10, %r11, 1), %ymm15
+	vaddpd		%ymm5, %ymm15, %ymm5
+	vmovapd		64(%r10, %r11, 1), %ymm15
+	vaddpd		%ymm6, %ymm15, %ymm6
+	vmovapd		96(%r10, %r11, 1), %ymm15
+	vaddpd		%ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_8x4_lib4, .-inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10   <- alpha
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
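+//
+// rough C sketch (illustrative; acc denotes the 8x4 block in ymm0..ymm7):
+//   for(j=0; j<4; j++) for(i=0; i<8; i++) acc[i][j] *= alpha;
+// C is never touched since beta==0.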
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_A0_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_a0_8x4_lib4, @function
+inner_scale_a0_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_a0_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_8x4_lib4:
+#endif
+#endif
+	
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_a0_8x4_lib4, .-inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
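+//
+// rough C sketch (illustrative; acc is the 8x4 block in ymm0..ymm7, C is
+// stored as two 4-row panels, the second one 4*sdc doubles after the first):
+//   for(j=0; j<4; j++) for(i=0; i<8; i++) acc[i][j] *= alpha;
+//   if(beta != 0.0)
+//       for(j=0; j<4; j++) for(i=0; i<4; i++) {
+//           acc[i  ][j] += beta * C[i + 4*j];          // panel at r12
+//           acc[4+i][j] += beta * C[i + 4*j + 4*sdc];  // panel at r12 + r13
+//       }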
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x4_lib4, @function
+inner_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_lib4:
+#endif
+#endif
+	
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	// alg==1
+	vmovapd		0(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+	vmovapd		0(%r12, %r13, 1), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm4, %ymm15, %ymm4
+	vmovapd		32(%r12, %r13, 1), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm5, %ymm15, %ymm5
+	vmovapd		64(%r12, %r13, 1), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm6, %ymm15, %ymm6
+	vmovapd		96(%r12, %r13, 1), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- offset
+// r13   <- C
+// r14   <- 4*sdc*sizeof(double)
+// r15   <- n0 // col index: start from (inc)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- offset
+// r13   <- C
+// r14   <- 4*sdc*sizeof(double)
+// r15   <- n0 // col index: start from (inc)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
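+//
+// rough sketch (illustrative): same update as inner_scale_ab_8x4_lib4,
+//   acc[i][j] = alpha*acc[i][j] + beta*C(offset+i, j),
+// but the block starts 'offset' (0..3) rows down inside the C panels, so each
+// column of C is reassembled from up to three consecutive panels with the
+// vblendpd / vperm2f128 / vshufpd sequences below.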
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x4_gen_lib4, @function
+inner_scale_ab_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_gen_lib4:
+#endif
+#endif
+	
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+
+	vxorpd		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovapd		0(%r13), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	vmovapd		32(%r13), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm1, %ymm14, %ymm1
+	vmovapd		64(%r13), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm2, %ymm14, %ymm2
+	vmovapd		96(%r13), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm3, %ymm14, %ymm3
+
+	vmovapd		0(%r13, %r14, 1), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm4, %ymm14, %ymm4
+	vmovapd		32(%r13, %r14, 1), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm5, %ymm14, %ymm5
+	vmovapd		64(%r13, %r14, 1), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm6, %ymm14, %ymm6
+	vmovapd		96(%r13, %r14, 1), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm7, %ymm14, %ymm7
+
+	jmp		3f
+
+0:
+
+	cmpl	$1, %r12d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r13, %r14, 1), %ymm13
+	vmovapd		0(%r13, %r14, 2), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm4, %ymm13, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%r13, %r14, 1), %ymm13
+	vmovapd		32(%r13, %r14, 2), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm1, %ymm12, %ymm1
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm5, %ymm13, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r13, %r14, 1), %ymm13
+	vmovapd		64(%r13, %r14, 2), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm6, %ymm13, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%r13, %r14, 1), %ymm13
+	vmovapd		96(%r13, %r14, 2), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm3, %ymm12, %ymm3
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm7, %ymm13, %ymm7
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r12d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r13, %r14, 1), %ymm13
+	vmovapd		0(%r13, %r14, 2), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm4, %ymm13, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%r13, %r14, 1), %ymm13
+	vmovapd		32(%r13, %r14, 2), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm1, %ymm12, %ymm1
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm5, %ymm13, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r13, %r14, 1), %ymm13
+	vmovapd		64(%r13, %r14, 2), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm6, %ymm13, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%r13, %r14, 1), %ymm13
+	vmovapd		96(%r13, %r14, 2), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm3, %ymm12, %ymm3
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm7, %ymm13, %ymm7
+
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r13, %r14, 1), %ymm13
+	vmovapd		0(%r13, %r14, 2), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm4, %ymm13, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%r13, %r14, 1), %ymm13
+	vmovapd		32(%r13, %r14, 2), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm1, %ymm12, %ymm1
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm5, %ymm13, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r13, %r14, 1), %ymm13
+	vmovapd		64(%r13, %r14, 2), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm6, %ymm13, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%r13, %r14, 1), %ymm13
+	vmovapd		96(%r13, %r14, 2), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm3, %ymm12, %ymm3
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm7, %ymm13, %ymm7
+
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x4_gen_lib4, .-inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
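+//
+// rough sketch (illustrative): fusion of inner_blend_8x4_lib4 and
+// inner_scale_ab_8x4_lib4 above: first regroup the rotated-diagonal
+// accumulators into plain columns, then
+//   acc[i][j] = alpha*acc[i][j] + beta*C[i][j]
+// with C stored as two 4-row panels (the second at r12 + 4*sdc doubles).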
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x4_lib4, @function
+inner_blend_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_lib4:
+#endif
+#endif
+	
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	// tc==n
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vblendpd	$0xa, %ymm5, %ymm4, %ymm8
+	vblendpd	$0x5, %ymm5, %ymm4, %ymm9
+	vblendpd	$0xa, %ymm7, %ymm6, %ymm10
+	vblendpd	$0x5, %ymm7, %ymm6, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm4
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm6
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm5
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm7
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	// alg==1
+	vmovapd		0(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+	vmovapd		0(%r12, %r13, 1), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm4, %ymm15, %ymm4
+	vmovapd		32(%r12, %r13, 1), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm5, %ymm15, %ymm5
+	vmovapd		64(%r12, %r13, 1), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm6, %ymm15, %ymm6
+	vmovapd		96(%r12, %r13, 1), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x4_lib4, .-inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
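+//
+// rough C sketch (illustrative; the eight registers hold the eight 4-element
+// columns of a 4x8 block, C is a single 4-row panel of 8 columns at r12):
+//   for(j=0; j<8; j++) for(i=0; i<4; i++) {
+//       acc[i][j] *= alpha;
+//       if(beta != 0.0) acc[i][j] += beta * C[i + 4*j];
+//   }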
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_4x8_lib4, @function
+inner_scale_ab_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_4x8_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x8_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm0, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm1, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm2, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm3, %ymm3
+	vmovapd		128(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm4, %ymm4
+	vmovapd		160(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm5, %ymm5
+	vmovapd		192(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm6, %ymm6
+	vmovapd		224(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm7, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_4x8_lib4, .-inner_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- &alpha
+// r11   <- &beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- &alpha
+// r11   <- &beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
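+//
+// rough sketch (illustrative): the vunpck/vperm2f128 network below acts as a
+// 4x4 transpose step that regroups the accumulated 8x4 product into the eight
+// 4-element columns of a 4x8 block; the result is then scaled as in the
+// routine above, roughly
+//   d[i][j] = alpha*acc_t[i][j] + beta*C[i + 4*j],   0<=i<4, 0<=j<8,
+// with C stored as a single 4-row, 8-column panel at r12.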
+
+#if MACRO_LEVEL>=1
+	.macro INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_tran_scale_ab_4x8_lib4, @function
+inner_tran_scale_ab_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_tran_scale_ab_4x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_lib4:
+#endif
+#endif
+		
+	vunpcklpd	%ymm1, %ymm0, %ymm12
+	vunpckhpd	%ymm0, %ymm1, %ymm13
+	vunpcklpd	%ymm3, %ymm2, %ymm14
+	vunpckhpd	%ymm2, %ymm3, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm0
+	vperm2f128	$0x31, %ymm12, %ymm14, %ymm2
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x31, %ymm13, %ymm15, %ymm3
+
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vunpcklpd	%ymm5, %ymm4, %ymm12
+	vunpckhpd	%ymm4, %ymm5, %ymm13
+	vunpcklpd	%ymm7, %ymm6, %ymm14
+	vunpckhpd	%ymm6, %ymm7, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm4
+	vperm2f128	$0x31, %ymm12, %ymm14, %ymm6
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm5
+	vperm2f128	$0x31, %ymm13, %ymm15, %ymm7
+
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	vbroadcastsd 0(%r11), %ymm14 // beta
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm0, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm1, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm2, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm3, %ymm3
+	vmovapd		128(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm4, %ymm4
+	vmovapd		160(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm5, %ymm5
+	vmovapd		192(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm6, %ymm6
+	vmovapd		224(%r12), %ymm15
+	vmulpd		%ymm14, %ymm15, %ymm15
+	vaddpd		%ymm15, %ymm7, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_tran_scale_ab_4x8_lib4, .-inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- offset
+// r13   <- C
+// r14   <- 4*sdc*sizeof(double)
+// r15   <- n0 // col index: start from (inc)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- offset
+// r13   <- C
+// r14   <- 4*sdc*sizeof(double)
+// r15   <- n0 // col index: start from (inc)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
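+//
+// rough sketch (illustrative): fusion of inner_blend_8x4_lib4 and
+// inner_scale_ab_8x4_gen_lib4: regroup the rotated-diagonal accumulators into
+// plain columns, then
+//   acc[i][j] = alpha*acc[i][j] + beta*C(offset+i, j)
+// where offset (0..3) shifts the block down inside the C panels, handled by
+// the blend/permute load sequences below.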
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x4_gen_lib4, @function
+inner_blend_scale_ab_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_gen_lib4:
+#endif
+#endif
+	
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	// tc==n
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vblendpd	$0xa, %ymm5, %ymm4, %ymm8
+	vblendpd	$0x5, %ymm5, %ymm4, %ymm9
+	vblendpd	$0xa, %ymm7, %ymm6, %ymm10
+	vblendpd	$0x5, %ymm7, %ymm6, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm4
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm6
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm5
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm7
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+
+	vxorpd		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovapd		0(%r13), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	vmovapd		32(%r13), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm1, %ymm14, %ymm1
+	vmovapd		64(%r13), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm2, %ymm14, %ymm2
+	vmovapd		96(%r13), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm3, %ymm14, %ymm3
+
+	vmovapd		0(%r13, %r14, 1), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm4, %ymm14, %ymm4
+	vmovapd		32(%r13, %r14, 1), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm5, %ymm14, %ymm5
+	vmovapd		64(%r13, %r14, 1), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm6, %ymm14, %ymm6
+	vmovapd		96(%r13, %r14, 1), %ymm14
+	vmulpd		%ymm14, %ymm15, %ymm14
+	vaddpd		%ymm7, %ymm14, %ymm7
+
+	jmp		3f
+
+0:
+
+	cmpl	$1, %r12d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r13, %r14, 1), %ymm13
+	vmovapd		0(%r13, %r14, 2), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm4, %ymm13, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%r13, %r14, 1), %ymm13
+	vmovapd		32(%r13, %r14, 2), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm1, %ymm12, %ymm1
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm5, %ymm13, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r13, %r14, 1), %ymm13
+	vmovapd		64(%r13, %r14, 2), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm6, %ymm13, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%r13, %r14, 1), %ymm13
+	vmovapd		96(%r13, %r14, 2), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm3, %ymm12, %ymm3
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm7, %ymm13, %ymm7
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r12d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r13, %r14, 1), %ymm13
+	vmovapd		0(%r13, %r14, 2), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm4, %ymm13, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%r13, %r14, 1), %ymm13
+	vmovapd		32(%r13, %r14, 2), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm1, %ymm12, %ymm1
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm5, %ymm13, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r13, %r14, 1), %ymm13
+	vmovapd		64(%r13, %r14, 2), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm6, %ymm13, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%r13, %r14, 1), %ymm13
+	vmovapd		96(%r13, %r14, 2), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm3, %ymm12, %ymm3
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm7, %ymm13, %ymm7
+
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r13, %r14, 1), %ymm13
+	vmovapd		0(%r13, %r14, 2), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm0, %ymm12, %ymm0
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm4, %ymm13, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%r13, %r14, 1), %ymm13
+	vmovapd		32(%r13, %r14, 2), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm1, %ymm12, %ymm1
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm5, %ymm13, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r13, %r14, 1), %ymm13
+	vmovapd		64(%r13, %r14, 2), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm2, %ymm12, %ymm2
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm6, %ymm13, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%r13, %r14, 1), %ymm13
+	vmovapd		96(%r13, %r14, 2), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vmulpd		%ymm12, %ymm15, %ymm12
+	vaddpd		%ymm3, %ymm12, %ymm3
+	vmulpd		%ymm13, %ymm15, %ymm13
+	vaddpd		%ymm7, %ymm13, %ymm7
+
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x4_gen_lib4, .-inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
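+//
+// rough C sketch (illustrative): regroup the rotated-diagonal accumulators
+// into plain columns (as in inner_blend_8x4_lib4), then add C unscaled:
+//   for(j=0; j<4; j++) for(i=0; i<4; i++) {
+//       acc[i  ][j] += C[i + 4*j];          // panel at r10
+//       acc[4+i][j] += C[i + 4*j + 4*sdc];  // panel at r10 + r11
+//   }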
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_8x4_lib4, @function
+inner_blend_scale_11_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_lib4:
+#endif
+#endif
+	
+
+	// tc==n
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vblendpd	$0xa, %ymm5, %ymm4, %ymm8
+	vblendpd	$0x5, %ymm5, %ymm4, %ymm9
+	vblendpd	$0xa, %ymm7, %ymm6, %ymm10
+	vblendpd	$0x5, %ymm7, %ymm6, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm4
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm6
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm5
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm7
+
+	// alg==1
+	vmovapd		0(%r10), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+	vmovapd		0(%r10, %r11, 1), %ymm15
+	vaddpd		%ymm4, %ymm15, %ymm4
+	vmovapd		32(%r10, %r11, 1), %ymm15
+	vaddpd		%ymm5, %ymm15, %ymm5
+	vmovapd		64(%r10, %r11, 1), %ymm15
+	vaddpd		%ymm6, %ymm15, %ymm6
+	vmovapd		96(%r10, %r11, 1), %ymm15
+	vaddpd		%ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_8x4_lib4, .-inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
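+//
+// rough C sketch of the factorization (illustrative; d denotes the 8x4 block
+// held in the accumulators, rows 0..3 being the diagonal block):
+//   for(j=0; j<4; j++) {
+//       double djj  = d[j][j];
+//       double dinv = (djj > 0.0) ? 1.0/sqrt(djj) : 0.0;  // 0.0 on breakdown
+//       inv_diag_E[j] = dinv;
+//       for(i=0; i<8; i++) d[i][j] *= dinv;               // scale column j
+//       for(k=j+1; k<4; k++)                              // trailing update
+//           for(i=0; i<8; i++) d[i][k] -= d[i][j] * d[k][j];
+//   }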
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DPOTRF_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dpotrf_8x4_lib4, @function
+inner_edge_dpotrf_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dpotrf_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_8x4_lib4:
+#endif
+#endif
+	
+	vxorpd	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd	.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd	LC04(%rip), %xmm14 // 1.0
+#endif
+
+	vmovsd		%xmm0, %xmm0, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+2:
+	vmovsd		%xmm13, 0(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm0
+	vmulpd		%ymm4, %ymm13, %ymm4
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm1, %ymm1
+	vmulpd		%ymm4, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm5, %ymm5
+	vperm2f128	$0x11, %ymm0, %ymm0, %ymm11
+	vpermilpd	$0x0, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm2, %ymm2
+	vmulpd		%ymm4, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm6, %ymm6
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+	vmulpd		%ymm4, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm7, %ymm7
+
+
+	vpermilpd	$0x3, %xmm1, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+4:
+	vmovsd		%xmm13, 8(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm1
+	vmulpd		%ymm5, %ymm13, %ymm5
+	vperm2f128	$0x11, %ymm1, %ymm1, %ymm11
+	vpermilpd	$0x0, %ymm11, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm2, %ymm2
+	vmulpd		%ymm5, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm6, %ymm6
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+	vmulpd		%ymm5, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm7, %ymm7
+
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+6:
+	vmovsd		%xmm13, 16(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm2
+	vmulpd		%ymm6, %ymm13, %ymm6
+	vperm2f128	$0x11, %ymm2, %ymm2, %ymm11
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+	vmulpd		%ymm6, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm7, %ymm7
+
+	vextractf128	$0x1, %ymm3, %xmm13
+	vpermilpd	$0x3, %xmm13, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+8:
+	vmovsd		%xmm13, 24(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm3, %ymm13, %ymm3
+	vmulpd		%ymm7, %ymm13, %ymm7
+
+	jmp				0f
+
+1:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dpotrf_8x4_lib4, .-inner_edge_dpotrf_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization, variable size (vs)
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
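+//
+// rough sketch (illustrative): same column-by-column factorization as
+// inner_edge_dpotrf_8x4_lib4 above, but only the first kn (r11d) columns are
+// processed, i.e. the j loop runs for(j=0; j<4 && j<kn; j++).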
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dpotrf_8x4_vs_lib4, @function
+inner_edge_dpotrf_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dpotrf_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_8x4_vs_lib4:
+#endif
+#endif
+	
+	vxorpd	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd	.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd	LC04(%rip), %xmm14 // 1.0
+#endif
+
+	vmovsd		%xmm0, %xmm0, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+2:
+	vmovsd		%xmm13, 0(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm0
+	vmulpd		%ymm4, %ymm13, %ymm4
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm1, %ymm1
+	vmulpd		%ymm4, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm5, %ymm5
+	vperm2f128	$0x11, %ymm0, %ymm0, %ymm11
+	vpermilpd	$0x0, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm2, %ymm2
+	vmulpd		%ymm4, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm6, %ymm6
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm0, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+	vmulpd		%ymm4, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm7, %ymm7
+
+	cmpl		$2, %r11d
+	jl			0f // ret
+
+	vpermilpd	$0x3, %xmm1, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+4:
+	vmovsd		%xmm13, 8(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm1
+	vmulpd		%ymm5, %ymm13, %ymm5
+	vperm2f128	$0x11, %ymm1, %ymm1, %ymm11
+	vpermilpd	$0x0, %ymm11, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm2, %ymm2
+	vmulpd		%ymm5, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm6, %ymm6
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm1, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+	vmulpd		%ymm5, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm7, %ymm7
+
+	cmpl		$3, %r11d
+	jl			0f // ret
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+6:
+	vmovsd		%xmm13, 16(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm2
+	vmulpd		%ymm6, %ymm13, %ymm6
+	vperm2f128	$0x11, %ymm2, %ymm2, %ymm11
+	vpermilpd	$0xf, %ymm11, %ymm13
+	vmulpd		%ymm2, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm3, %ymm3
+	vmulpd		%ymm6, %ymm13, %ymm12
+	vsubpd		%ymm12, %ymm7, %ymm7
+
+	cmpl		$4, %r11d
+	jl			0f // ret
+
+	vextractf128	$0x1, %ymm3, %xmm13
+	vpermilpd	$0x3, %xmm13, %xmm13
+	vucomisd	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtsd		%xmm13, %xmm13, %xmm13
+	vdivsd		%xmm13, %xmm14, %xmm13
+8:
+	vmovsd		%xmm13, 24(%r10)
+	vmovddup	%xmm13, %xmm13
+	vperm2f128	$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd		%ymm3, %ymm13, %ymm3
+	vmulpd		%ymm7, %ymm13, %ymm7
+
+	jmp				0f
+
+1:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dpotrf_8x4_vs_lib4, .-inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization 
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
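+//
+// rough C sketch (illustrative; d is the 8x4 block, E a 4x4 lower-triangular
+// panel with element (k,j) at double offset k+4*j, and inv_diag_E holding the
+// reciprocals of its diagonal):
+//   for(j=0; j<4; j++) {
+//       for(i=0; i<8; i++) d[i][j] *= inv_diag_E[j];
+//       for(k=j+1; k<4; k++)
+//           for(i=0; i<8; i++) d[i][k] -= d[i][j] * E[k][j];
+//   }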
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_8x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vbroadcastsd	8(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm1, %ymm1
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm5, %ymm5
+	vbroadcastsd	16(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm6, %ymm6
+	vbroadcastsd	24(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vbroadcastsd	48(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vmulpd			%ymm5, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm6, %ymm6
+	vbroadcastsd	56(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm5, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vbroadcastsd	88(%r10), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm6, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_8x4_lib4, .-inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization 
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
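+//
+// rough sketch (illustrative): same substitution as
+// inner_edge_dtrsm_rlt_inv_8x4_lib4 above, restricted to the first kn (r12d)
+// columns; columns at index >= kn are left unfinished since the caller does
+// not store them.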
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vbroadcastsd	8(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm1, %ymm1
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm5, %ymm5
+	vbroadcastsd	16(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm6, %ymm6
+	vbroadcastsd	24(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+
+	cmpl			$2, %r12d
+	jl				0f // ret
+
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vbroadcastsd	48(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vmulpd			%ymm5, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm6, %ymm6
+	vbroadcastsd	56(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm5, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+
+	cmpl			$3, %r12d
+	jl				0f // ret
+
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vbroadcastsd	88(%r10), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm6, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+
+	cmpl			$4, %r12d
+	jl				0f // ret
+
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_8x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
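+//
+// rough C sketch (illustrative; E is unit lower triangular, its diagonal is
+// implicit and never read, element (k,j) at double offset k+4*j):
+//   for(j=0; j<4; j++)
+//       for(k=j+1; k<4; k++)
+//           for(i=0; i<8; i++) d[i][k] -= d[i][j] * E[k][j];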
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_ONE_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_one_8x4_lib4, @function
+inner_edge_dtrsm_rlt_one_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_one_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_8x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	8(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm1, %ymm1
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm5, %ymm5
+
+	vbroadcastsd	16(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm6, %ymm6
+	vbroadcastsd	48(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vmulpd			%ymm5, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm6, %ymm6
+
+	vbroadcastsd	24(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+	vbroadcastsd	56(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm5, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+	vbroadcastsd	88(%r10), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm6, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_one_8x4_lib4, .-inner_edge_dtrsm_rlt_one_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10  <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
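+//
+// rough sketch (illustrative): same unit-diagonal substitution as
+// inner_edge_dtrsm_rlt_one_8x4_lib4 above, but columns at index >= kn (r11d)
+// are skipped since the caller does not store them.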
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_ONE_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_one_8x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl			$2, %r11d
+
+	jl				0f // ret
+
+	vbroadcastsd	8(%r10), %ymm13
+	cmpl			$3, %r11d
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm1, %ymm1
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm5, %ymm5
+
+	jl				0f // ret
+
+	vbroadcastsd	16(%r10), %ymm13
+	cmpl			$4, %r11d
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm6, %ymm6
+	vbroadcastsd	48(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm2, %ymm2
+	vmulpd			%ymm5, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm6, %ymm6
+
+	jl				0f // ret
+
+	vbroadcastsd	24(%r10), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm4, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+	vbroadcastsd	56(%r10), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm5, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+	vbroadcastsd	88(%r10), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm3, %ymm3
+	vmulpd			%ymm6, %ymm13, %ymm12
+	vsubpd			%ymm12, %ymm7, %ymm7
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_one_8x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
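+// Reference sketch (illustrative only): D <- D * E^-T with E a 4x4 upper
+// triangular factor whose reciprocal diagonal is passed separately as
+// inv_diag_E[j] = 1.0/E[j][j]; a backward sweep over the columns, as the code
+// below does. D0/D1 are the two 4x4 panels of the 8x4 block, E column-major:
+//
+//	static void ref_dtrsm_rut_inv_8x4(const double *E, const double *inv_diag_E,
+//		double *D0, double *D1)
+//		{
+//		int i, j, k;
+//		for(j=3; j>=0; j--)
+//			{
+//			for(i=0; i<4; i++)
+//				{
+//				D0[i+4*j] *= inv_diag_E[j];
+//				D1[i+4*j] *= inv_diag_E[j];
+//				}
+//			for(k=0; k<j; k++)
+//				for(i=0; i<4; i++)
+//					{
+//					D0[i+4*k] -= E[k+4*j] * D0[i+4*j];
+//					D1[i+4*k] -= E[k+4*j] * D1[i+4*j];
+//					}
+//			}
+//		}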
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUT_INV_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rut_inv_8x4_lib4, @function
+inner_edge_dtrsm_rut_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rut_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_8x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vmulpd			%ymm7, %ymm12, %ymm7
+	vbroadcastsd	112(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm7, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	104(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm7, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	96(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm7, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vmulpd			%ymm6, %ymm12, %ymm6
+	vbroadcastsd	72(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm6, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm6, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vmulpd			%ymm5, %ymm12, %ymm5
+	vbroadcastsd	32(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm5, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+	vmulpd			%ymm4, %ymm12, %ymm4
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rut_inv_8x4_lib4, .-inner_edge_dtrsm_rut_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUT_INV_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rut_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rut_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl			$3, %r12d
+	jle				0f
+
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vmulpd			%ymm7, %ymm12, %ymm7
+	vbroadcastsd	112(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm7, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	104(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm7, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	96(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm7, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+
+0:
+	cmpl			$2, %r12d
+	jle				1f
+
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vmulpd			%ymm6, %ymm12, %ymm6
+	vbroadcastsd	72(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm6, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	64(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm6, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+
+1:
+	cmpl			$1, %r12d
+	jle				2f
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vmulpd			%ymm5, %ymm12, %ymm5
+	vbroadcastsd	32(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm5, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+
+2:
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+	vmulpd			%ymm4, %ymm12, %ymm4
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rut_inv_8x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
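+// Reference sketch (illustrative only): D <- D * E^-1 with E a 4x4 upper
+// triangular factor and inv_diag_E[j] = 1.0/E[j][j], processed column by
+// column in the same order as the code below (first, second, third, fourth):
+//
+//	static void ref_dtrsm_run_inv_8x4(const double *E, const double *inv_diag_E,
+//		double *D0, double *D1)
+//		{
+//		int i, j, k;
+//		for(j=0; j<4; j++)
+//			{
+//			for(k=0; k<j; k++)
+//				for(i=0; i<4; i++)
+//					{
+//					D0[i+4*j] -= E[k+4*j] * D0[i+4*k];
+//					D1[i+4*j] -= E[k+4*j] * D1[i+4*k];
+//					}
+//			for(i=0; i<4; i++)
+//				{
+//				D0[i+4*j] *= inv_diag_E[j];
+//				D1[i+4*j] *= inv_diag_E[j];
+//				}
+//			}
+//		}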
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_run_inv_8x4_lib4, @function
+inner_edge_dtrsm_run_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_run_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_8x4_lib4:
+#endif
+#endif
+
+	// first column
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+	vmulpd			%ymm4, %ymm12, %ymm4
+
+	// second column
+	vbroadcastsd	32(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm4, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vmulpd			%ymm5, %ymm12, %ymm5
+
+	// third column
+	vbroadcastsd	64(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm4, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	72(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm5, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vmulpd			%ymm6, %ymm12, %ymm6
+
+	// fourth column
+	vbroadcastsd	96(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm4, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vbroadcastsd	104(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm5, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vbroadcastsd	112(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm6, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vmulpd			%ymm7, %ymm12, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_run_inv_8x4_lib4, .-inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10  <- E0
+// r11  <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E0
+// r11  <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
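+// Reference sketch (illustrative only): D <- E^-1 * D, a forward substitution
+// with the 8x8 unit-lower-triangular factor E; in the kernel E and D are held
+// in 4-row panels (the second panel sits 4*sde doubles further on, which is
+// what the (%r10, %r11, 1) addressing walks), but the arithmetic is just:
+//
+//	// plain column-major arrays, 8x8 E and 8x4 D, for readability
+//	static void ref_dtrsm_lln_one_8x4(const double *E, double *D)
+//		{
+//		int i, j, k;
+//		for(j=0; j<4; j++)
+//			for(k=0; k<8; k++)
+//				for(i=k+1; i<8; i++)
+//					D[i+8*j] -= E[i+8*k] * D[k+8*j];
+//		}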
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lln_one_8x4_lib4, @function
+inner_edge_dtrsm_lln_one_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lln_one_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_8x4_lib4:
+#endif
+#endif
+
+	// solve top-left
+	vxorpd		%ymm14, %ymm14, %ymm14
+
+	vmovapd		0(%r10), %ymm12
+	vxorpd		%ymm14, %ymm14, %ymm14
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd		0(%r10, %r11, 1), %ymm14
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm0, %ymm0
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm4, %ymm4
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm1, %ymm1
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm5, %ymm5
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm2, %ymm2
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm6, %ymm6
+	vperm2f128	$0x00, %ymm3, %ymm3, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm7, %ymm7
+
+	vmovapd		32(%r10), %ymm12
+	vxorpd		%ymm14, %ymm14, %ymm14
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r10, %r11, 1), %ymm14
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm0, %ymm0
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm4, %ymm4
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm1, %ymm1
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm5, %ymm5
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm2, %ymm2
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm6, %ymm6
+	vperm2f128	$0x00, %ymm3, %ymm3, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm7, %ymm7
+
+	vmovapd		64(%r10), %ymm12
+	vxorpd		%ymm14, %ymm14, %ymm14
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd		64(%r10, %r11, 1), %ymm14
+	vperm2f128	$0x11, %ymm0, %ymm0, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm0, %ymm0
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm4, %ymm4
+	vperm2f128	$0x11, %ymm1, %ymm1, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm1, %ymm1
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm5, %ymm5
+	vperm2f128	$0x11, %ymm2, %ymm2, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm2, %ymm2
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm6, %ymm6
+	vperm2f128	$0x11, %ymm3, %ymm3, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm3, %ymm3
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm7, %ymm7
+
+	vmovapd		96(%r10, %r11, 1), %ymm14
+	vperm2f128	$0x11, %ymm0, %ymm0, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm4, %ymm4
+	vperm2f128	$0x11, %ymm1, %ymm1, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm5, %ymm5
+	vperm2f128	$0x11, %ymm2, %ymm2, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm6, %ymm6
+	vperm2f128	$0x11, %ymm3, %ymm3, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm14, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm7, %ymm7
+
+	addq		$128, %r10
+
+
+	// solve bottom-right
+	vxorpd		%ymm14, %ymm14, %ymm14
+
+	vmovapd		0(%r10, %r11, 1), %ymm12
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x00, %ymm4, %ymm4, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm4, %ymm4
+	vperm2f128	$0x00, %ymm5, %ymm5, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm5, %ymm5
+	vperm2f128	$0x00, %ymm6, %ymm6, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm6, %ymm6
+	vperm2f128	$0x00, %ymm7, %ymm7, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm7, %ymm7
+
+	vmovapd		32(%r10, %r11, 1), %ymm12
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x00, %ymm4, %ymm4, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm4, %ymm4
+	vperm2f128	$0x00, %ymm5, %ymm5, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm5, %ymm5
+	vperm2f128	$0x00, %ymm6, %ymm6, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm6, %ymm6
+	vperm2f128	$0x00, %ymm7, %ymm7, %ymm13
+	vpermilpd	$0xf, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm7, %ymm7
+
+	vmovapd		64(%r10, %r11, 1), %ymm12
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x11, %ymm4, %ymm4, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm4, %ymm4
+	vperm2f128	$0x11, %ymm5, %ymm5, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm5, %ymm5
+	vperm2f128	$0x11, %ymm6, %ymm6, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm6, %ymm6
+	vperm2f128	$0x11, %ymm7, %ymm7, %ymm13
+	vpermilpd	$0x0, %ymm13, %ymm13
+	vmulpd		%ymm12, %ymm13, %ymm15
+	vsubpd		%ymm15, %ymm7, %ymm7
+
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lln_one_8x4_lib4, .-inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
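+// Reference sketch (illustrative only): D <- E^-1 * D with E the 8x8 upper
+// triangular factor and inv_diag_E[k] = 1.0/E[k][k]; a backward substitution
+// over the 8 rows, which is why the code starts from the bottom-right block.
+// Plain column-major arrays for readability (the kernel uses 4-row panels):
+//
+//	static void ref_dtrsm_lun_inv_8x4(const double *E, const double *inv_diag_E,
+//		double *D)
+//		{
+//		int i, j, k;
+//		for(j=0; j<4; j++)
+//			for(k=7; k>=0; k--)
+//				{
+//				D[k+8*j] *= inv_diag_E[k];
+//				for(i=0; i<k; i++)
+//					D[i+8*j] -= E[i+8*k] * D[k+8*j];
+//				}
+//		}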
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LUN_INV_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lun_inv_8x4_lib4, @function
+inner_edge_dtrsm_lun_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lun_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_8x4_lib4:
+#endif
+#endif
+	
+	// bottom-right
+
+	vmovapd			224(%r10, %r11, 1), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	56(%r12), %ymm12
+	vmovapd			224(%r10), %ymm11
+
+	vperm2f128		$0x11, %ymm4, %ymm4, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm4, %ymm4
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm5, %ymm5, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm5, %ymm5
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm6, %ymm6, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm6, %ymm6
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm7, %ymm7, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm7, %ymm7
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0 (redundant: the VEX 128-bit vmovapd below already zeroes the upper lanes of ymm13)
+	vmovapd			192(%r10, %r11, 1), %xmm13
+	vbroadcastsd	48(%r12), %ymm12
+	vmovapd			192(%r10), %ymm11
+
+	vperm2f128		$0x11, %ymm4, %ymm4, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm4, %ymm4
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm5, %ymm5, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm5, %ymm5
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm6, %ymm6, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm6, %ymm6
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm7, %ymm7, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm7, %ymm7
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			160(%r10, %r11, 1), %xmm13
+	vbroadcastsd	40(%r12), %ymm12
+	vmovapd			160(%r10), %ymm11
+
+	vperm2f128		$0x00, %ymm4, %ymm4, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm4, %ymm4
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x00, %ymm5, %ymm5, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm5, %ymm5
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x00, %ymm6, %ymm6, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm6, %ymm6
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x00, %ymm7, %ymm7, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm7, %ymm7
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vbroadcastsd	32(%r12), %ymm12
+	vmovapd			128(%r10), %ymm11
+
+	vperm2f128		$0x00, %ymm4, %ymm4, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm4, %ymm4
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x00, %ymm5, %ymm5, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm5, %ymm5
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x00, %ymm6, %ymm6, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm6, %ymm6
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x00, %ymm7, %ymm7, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm7, %ymm7
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	// top-left
+
+	vmovapd			96(%r10), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	24(%r12), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovapd			64(%r10), %xmm13
+	vbroadcastsd	16(%r12), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			32(%r10), %xmm13
+	vbroadcastsd	8(%r12), %ymm12
+
+	vpermilpd		$0xf, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vpermilpd		$0xf, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vpermilpd		$0xf, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vpermilpd		$0xf, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vbroadcastsd	0(%r12), %ymm12
+
+	vmulpd			%ymm0, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm0, %ymm0
+
+	vmulpd			%ymm1, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm1, %ymm1
+
+	vmulpd			%ymm2, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm2, %ymm2
+
+	vmulpd			%ymm3, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lun_inv_8x4_lib4, .-inner_edge_dtrsm_lun_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// r13  <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// r13  <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LUN_INV_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lun_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lun_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#endif
+#endif
+	
+	// bottom-right
+
+	cmpl	$7, %r13d
+	jle		0f
+
+	vmovapd			224(%r10, %r11, 1), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	56(%r12), %ymm12
+	vmovapd			224(%r10), %ymm11
+
+	vperm2f128		$0x11, %ymm4, %ymm4, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm4, %ymm4
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm5, %ymm5, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm5, %ymm5
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm6, %ymm6, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm6, %ymm6
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm7, %ymm7, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm7, %ymm7
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+0:
+	cmpl	$6, %r13d
+	jle		1f
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0 (redundant: the VEX 128-bit vmovapd below already zeroes the upper lanes of ymm13)
+	vmovapd			192(%r10, %r11, 1), %xmm13
+	vbroadcastsd	48(%r12), %ymm12
+	vmovapd			192(%r10), %ymm11
+
+	vperm2f128		$0x11, %ymm4, %ymm4, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm4, %ymm4
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm5, %ymm5, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm5, %ymm5
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm6, %ymm6, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm6, %ymm6
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm7, %ymm7, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm7, %ymm7
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+1:
+	cmpl	$5, %r13d
+	jle		2f
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			160(%r10, %r11, 1), %xmm13
+	vbroadcastsd	40(%r12), %ymm12
+	vmovapd			160(%r10), %ymm11
+
+	vperm2f128		$0x00, %ymm4, %ymm4, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm4, %ymm4
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm4, %ymm4
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x00, %ymm5, %ymm5, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm5, %ymm5
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x00, %ymm6, %ymm6, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm6, %ymm6
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x00, %ymm7, %ymm7, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm7, %ymm7
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+2:
+
+	vbroadcastsd	32(%r12), %ymm12
+	vmovapd			128(%r10), %ymm11
+
+	vperm2f128		$0x00, %ymm4, %ymm4, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm4, %ymm4
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x00, %ymm5, %ymm5, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm5, %ymm5
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x00, %ymm6, %ymm6, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm6, %ymm6
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x00, %ymm7, %ymm7, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm7, %ymm7
+	vmulpd			%ymm11, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	// top-left
+
+	vmovapd			96(%r10), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	24(%r12), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovapd			64(%r10), %xmm13
+	vbroadcastsd	16(%r12), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			32(%r10), %xmm13
+	vbroadcastsd	8(%r12), %ymm12
+
+	vpermilpd		$0xf, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	vpermilpd		$0xf, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+
+	vpermilpd		$0xf, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+
+	vpermilpd		$0xf, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm3, %ymm3
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+
+
+	vbroadcastsd	0(%r12), %ymm12
+
+	vmulpd			%ymm0, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm0, %ymm0
+
+	vmulpd			%ymm1, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm1, %ymm1
+
+	vmulpd			%ymm2, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm2, %ymm2
+
+	vmulpd			%ymm3, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lun_inv_8x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// left kernel
+//
+// input arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
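+// Reference sketch (illustrative only): the factorization this edge step
+// completes on the already-accumulated 8x4 block: LU without pivoting, with U
+// kept on and above the diagonal, the unit-lower multipliers stored below it,
+// and the reciprocal pivots written out to inv_diag_E. Plain column-major
+// D[8*4] for readability (the kernel keeps the block in ymm0-ymm7):
+//
+//	static void ref_dgetrf_l_8x4(double *D, double *inv_diag_E)
+//		{
+//		int i, j, k;
+//		for(j=0; j<4; j++)
+//			{
+//			for(k=0; k<j; k++)
+//				for(i=k+1; i<8; i++)
+//					D[i+8*j] -= D[i+8*k] * D[k+8*j];
+//			inv_diag_E[j] = 1.0 / D[j+8*j];
+//			for(i=j+1; i<8; i++)
+//				D[i+8*j] *= inv_diag_E[j];
+//			}
+//		}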
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgetrf_l_8x4_lib4, @function
+inner_edge_dgetrf_l_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgetrf_l_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_l_8x4_lib4:
+#endif
+#endif
+	
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd			LC04(%rip), %xmm14 // 1.0
+#endif
+//	vmovddup		%xmm14, %xmm14
+
+	// first column
+//	vblendpd		$0x1, %ymm0, %ymm12, %ymm12
+	vmovapd			%ymm0, %ymm12
+	vdivsd			%xmm0, %xmm14, %xmm13
+//	vpermpd			$0x00, %ymm13, %ymm13
+	vmovddup		%xmm13, %xmm13
+	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd			%xmm13, 0(%r10)
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vblendpd		$0x1, %ymm12, %ymm0, %ymm0
+
+	// second column
+//	vpermpd			$0x00, %ymm1, %ymm13
+	vmovddup		%xmm1, %xmm13
+	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm5, %ymm5
+	vblendpd		$0x2, %ymm1, %ymm13, %ymm12
+
+	vpermilpd		$0x3, %xmm1, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+//	vpermpd			$0x00, %ymm13, %ymm13
+	vmovddup		%xmm13, %xmm13
+	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd			%xmm13, 8(%r10)
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vblendpd		$0x3, %ymm12, %ymm1, %ymm1
+
+	// third column
+//	vpermpd			$0x00, %ymm2, %ymm13
+	vmovddup		%xmm2, %xmm13
+	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vblendpd		$0x2, %ymm2, %ymm13, %ymm12
+
+//	vpermpd			$0x55, %ymm2, %ymm13
+	vperm2f128		$0x00, %ymm2, %ymm2, %ymm13
+	vpermilpd		$0xf, %ymm13, %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm6, %ymm6
+	vblendpd		$0x4, %ymm2, %ymm12, %ymm12
+
+//	vpermpd			$0xaa, %ymm2, %ymm13
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm13
+	vpermilpd		$0x0, %ymm13, %ymm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+//	vpermpd			$0x00, %ymm13, %ymm13
+	vmovddup		%xmm13, %xmm13
+	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd			%xmm13, 16(%r10)
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vblendpd		$0x7, %ymm12, %ymm2, %ymm2
+
+	// fourth column
+//	vpermpd			$0x00, %ymm3, %ymm13
+	vmovddup		%xmm3, %xmm13
+	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vblendpd		$0x2, %ymm3, %ymm13, %ymm12
+
+//	vpermpd			$0x55, %ymm3, %ymm13
+	vperm2f128		$0x00, %ymm3, %ymm3, %ymm13
+	vpermilpd		$0xf, %ymm13, %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vblendpd		$0x4, %ymm3, %ymm12, %ymm12
+
+//	vpermpd			$0xaa, %ymm3, %ymm13
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm11
+	vpermilpd		$0x0, %ymm11, %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm6, %ymm13, %ymm15
+	vsubpd			%ymm15, %ymm7, %ymm7
+	vblendpd		$0x8, %ymm3, %ymm12, %ymm12
+	
+//	vpermpd			$0xff, %ymm3, %ymm13
+//	vperm2f128		$0x11, %ymm3, %ymm3, %ymm11
+	vpermilpd		$0xf, %ymm11, %ymm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+//	vpermpd			$0x00, %ymm13, %ymm13
+	vmovddup		%xmm13, %xmm13
+	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd			%xmm13, 24(%r10)
+//	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+	vblendpd		$0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgetrf_l_8x4_lib4, .-inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
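+// Layout note with a reference sketch (illustrative only): D uses the lib4
+// panel-major format, 4-row panels stored column by column, the next panel
+// 4*sdd doubles further on. Rows 0-3 of the result go to the first panel and
+// rows 4-7 to the second, which is all this routine does:
+//
+//	// acc: the 8x4 result in plain column-major form (in the kernel it lives
+//	// in ymm0-ymm7); D: panel-major destination; sdd: panel stride.
+//	static void ref_store_8x4(const double *acc, double *D, int sdd)
+//		{
+//		int i, j;
+//		for(j=0; j<4; j++)
+//			for(i=0; i<4; i++)
+//				{
+//				D[i+4*j] = acc[i+8*j];
+//				D[i+4*j+4*sdd] = acc[i+4+8*j];
+//				}
+//		}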
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_lib4, @function
+inner_store_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_lib4:
+#endif
+#endif
+	
+	vmovapd %ymm0,  0(%r10)
+	vmovapd %ymm1, 32(%r10)
+	vmovapd %ymm2, 64(%r10)
+	vmovapd %ymm3, 96(%r10)
+
+	vmovapd %ymm4,  0(%r10, %r11, 1)
+	vmovapd %ymm5, 32(%r10, %r11, 1)
+	vmovapd %ymm6, 64(%r10, %r11, 1)
+	vmovapd %ymm7, 96(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x8_lib4, @function
+inner_store_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x8_lib4; .scl 2; .type 32; .endef
+inner_store_4x8_lib4:
+#endif
+#endif
+	
+	vmovapd %ymm0,   0(%r10)
+	vmovapd %ymm1,  32(%r10)
+	vmovapd %ymm2,  64(%r10)
+	vmovapd %ymm3,  96(%r10)
+
+	vmovapd %ymm4, 128(%r10)
+	vmovapd %ymm5, 160(%r10)
+	vmovapd %ymm6, 192(%r10)
+	vmovapd %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_lib4, .-inner_store_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
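+// Sketch of the masking trick used below (illustrative only, and assuming
+// .LC03 holds {4.5, 5.5, 6.5, 7.5}, which is defined elsewhere in this file):
+// km is converted to double, broadcast and subtracted, so lane i of the mask
+// ends up holding (4.5+i) - km; vmaskmovpd then stores a lane iff the sign
+// bit of that difference is set, i.e. iff global row 4+i is below km.
+//
+//	static void ref_store_row_mask(int km, long long mask[4])
+//		{
+//		const double lc03[4] = {4.5, 5.5, 6.5, 7.5}; // stand-in for .LC03
+//		int i;
+//		for(i=0; i<4; i++)
+//			mask[i] = (lc03[i] - (double)km < 0.0) ? -1LL : 0LL;
+//		}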
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_vs_lib4, @function
+inner_store_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC03(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	cmpl		$2, %r13d
+	vmovapd		%ymm0, 0(%r10)
+	vmaskmovpd	%ymm4, %ymm15,  0(%r10, %r11, 1)
+	jl			0f // end
+	cmpl		$3, %r13d
+	vmovapd		%ymm1, 32(%r10)
+	vmaskmovpd	%ymm5, %ymm15, 32(%r10, %r11, 1)
+	jl			0f // end
+	vmovapd		%ymm2, 64(%r10)
+	vmaskmovpd	%ymm6, %ymm15, 64(%r10, %r11, 1)
+	je			0f // end
+	vmovapd		%ymm3, 96(%r10)
+	vmaskmovpd	%ymm7, %ymm15, 96(%r10, %r11, 1)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_vs_lib4, .-inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10   <- D
+// r11d  <- km
+// r12d  <- kn
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10   <- D
+// r11d  <- km
+// r12d  <- kn
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X8_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x8_vs_lib4, @function
+inner_store_4x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x8_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x8_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	vmaskmovpd	%ymm0, %ymm15,   0(%r10)
+	vmaskmovpd	%ymm1, %ymm15,  32(%r10)
+	vmaskmovpd	%ymm2, %ymm15,  64(%r10)
+	vmaskmovpd	%ymm3, %ymm15,  96(%r10)
+
+	vmaskmovpd	%ymm4, %ymm15, 128(%r10)
+	cmpl		$6, %r12d
+	jl			0f // end
+	vmaskmovpd	%ymm5, %ymm15, 160(%r10)
+	cmpl		$7, %r12d
+	jl			0f // end
+	vmaskmovpd	%ymm6, %ymm15, 192(%r10)
+	je			0f // end
+	vmaskmovpd	%ymm7, %ymm15, 224(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_vs_lib4, .-inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
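+// Reference sketch (illustrative only): only the lower-triangular part of the
+// top 4x4 panel is overwritten; the strictly-upper entries are re-loaded from
+// memory and blended back in, while the bottom panel (rows 4-7) is stored whole:
+//
+//	static void ref_store_l_8x4(const double *acc, double *D, int sdd)
+//		{
+//		int i, j;
+//		for(j=0; j<4; j++)
+//			{
+//			for(i=j; i<4; i++)
+//				D[i+4*j] = acc[i+8*j]; // lower triangle of rows 0-3
+//			for(i=0; i<4; i++)
+//				D[i+4*j+4*sdd] = acc[i+4+8*j]; // rows 4-7, full column
+//			}
+//		}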
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_lib4, @function
+inner_store_l_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_lib4:
+#endif
+#endif
+	
+	vmovapd		%ymm0, 0(%r10)
+	vmovapd		32(%r10), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm1, %ymm1	
+	vmovapd		%ymm1, 32(%r10)
+	vmovapd		64(%r10), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm2, %ymm2	
+	vmovapd		%ymm2, 64(%r10)
+	vmovapd		96(%r10), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm3, %ymm3	
+	vmovapd		%ymm3, 96(%r10)
+
+	vmovapd		%ymm4, 0(%r10, %r11, 1)
+	vmovapd		%ymm5, 32(%r10, %r11, 1)
+	vmovapd		%ymm6, 64(%r10, %r11, 1)
+	vmovapd		%ymm7, 96(%r10, %r11, 1)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_lib4, .-inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_vs_lib4, @function
+inner_store_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC03(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	cmpl		$2, %r13d
+	vmovapd		%ymm0, 0(%r10)
+	vmaskmovpd	%ymm4, %ymm15,  0(%r10, %r11, 1)
+	jl			0f // end
+	cmpl		$3, %r13d
+	vmovapd		32(%r10), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm1, %ymm1	
+	vmovapd		%ymm1, 32(%r10)
+	vmaskmovpd	%ymm5, %ymm15, 32(%r10, %r11, 1)
+	jl			0f // end
+	vmovapd		64(%r10), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm2, %ymm2	
+	vmovapd		%ymm2, 64(%r10)
+	vmaskmovpd	%ymm6, %ymm15, 64(%r10, %r11, 1)
+	je			0f // end
+	vmovapd		96(%r10), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm3, %ymm3	
+	vmovapd		%ymm3, 96(%r10)
+	vmaskmovpd	%ymm7, %ymm15, 96(%r10, %r11, 1)
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_vs_lib4, .-inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// rbp  <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// rbp  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
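+// Note on the column handling below (illustrative only): when n0 > 0 the
+// accumulator registers are shifted left by n0 columns and the D pointer is
+// advanced by the same amount, so the surviving columns still land at their
+// own column offsets; the number of columns actually written is then clamped
+// to the window [n0, min(n1, 4)). In C terms:
+//
+//	int cols = (n1 < 4 ? n1 : 4) - n0; // columns written, starting at col n0
+//
+// The row window [m0, m1) and the storage offset are handled by the masks and
+// the register rotations further down.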
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_gen_lib4, @function
+inner_store_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_gen_lib4:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2sd	%r13d, %xmm14, %xmm14
+	vcvtsi2sd	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm12
+	vmovupd		.LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm12
+	vmovupd		LC03(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm12, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm13, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm6, %ymm5
+	vmovapd		%ymm3, %ymm2
+	vmovapd		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	cmpl		$2, %r15d
+	vmaskmovpd	%ymm0, %ymm14,  0(%r11)
+	vmaskmovpd	%ymm4, %ymm15,  0(%r11, %r12, 1)
+	jl			4f // end
+	cmpl		$3, %r15d
+	vmaskmovpd	%ymm1, %ymm14, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm15, 32(%r11, %r12, 1)
+	jl			4f // end
+	vmaskmovpd	%ymm2, %ymm14, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm15, 64(%r11, %r12, 1)
+	je			4f // end
+	vmaskmovpd	%ymm3, %ymm14, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm15, 96(%r11, %r12, 1)
+
+	jmp		4f
+
+0:
+	
+	cmpl	$1, %r10d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x03, %ymm4, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm0, %ymm12, %ymm0
+	vperm2f128	$0x03, %ymm13, %ymm4, %ymm12
+	vshufpd		$0x5, %ymm4, %ymm12, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x03, %ymm5, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm1, %ymm12, %ymm1
+	vperm2f128	$0x03, %ymm13, %ymm5, %ymm12
+	vshufpd		$0x5, %ymm5, %ymm12, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x03, %ymm6, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm2, %ymm12, %ymm2
+	vperm2f128	$0x03, %ymm13, %ymm6, %ymm12
+	vshufpd		$0x5, %ymm6, %ymm12, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x03, %ymm7, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm3, %ymm12, %ymm3
+	vperm2f128	$0x03, %ymm13, %ymm7, %ymm12
+	vshufpd		$0x5, %ymm7, %ymm12, %ymm7
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm15, %ymm12, %ymm15
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm12
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC08(%rip), %ymm12
+	vmovupd		.LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC08(%rip), %ymm12
+	vmovupd		LC05(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x1, %ymm14, %ymm15, %ymm14
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r10d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x03, %ymm4, %ymm0, %ymm0
+	vperm2f128	$0x03, %ymm13, %ymm4, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x03, %ymm5, %ymm1, %ymm1
+	vperm2f128	$0x03, %ymm13, %ymm5, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x03, %ymm6, %ymm2, %ymm2
+	vperm2f128	$0x03, %ymm13, %ymm6, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x03, %ymm7, %ymm3, %ymm3
+	vperm2f128	$0x03, %ymm13, %ymm7, %ymm7
+
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm14
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC09(%rip), %ymm12
+	vmovupd		.LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC09(%rip), %ymm12
+	vmovupd		LC06(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x3, %ymm14, %ymm15, %ymm14
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x21, %ymm0, %ymm4, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm4, %ymm0
+	vperm2f128	$0x21, %ymm4, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x21, %ymm1, %ymm5, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm5, %ymm1
+	vperm2f128	$0x21, %ymm5, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x21, %ymm2, %ymm6, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm6, %ymm2
+	vperm2f128	$0x21, %ymm6, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x21, %ymm3, %ymm7, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm7, %ymm3
+	vperm2f128	$0x21, %ymm7, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm7
+
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm14
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC10(%rip), %ymm12
+	vmovupd		.LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC10(%rip), %ymm12
+	vmovupd		LC07(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x7, %ymm14, %ymm15, %ymm14
+
+3:
+
+	cmpl		$2, %r15d
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm4, %ymm14, 0(%r11, %r12, 1)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 2)
+	jl			4f // end
+	cmpl		$3, %r15d
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm14, 32(%r11, %r12, 1)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 2)
+	jl			4f // end
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm14, 64(%r11, %r12, 1)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 2)
+	je			4f // end
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm14, 96(%r11, %r12, 1)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 2)
+
+4:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_gen_lib4, .-inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// rbp  <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// rbp  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_gen_lib4, @function
+inner_store_l_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_gen_lib4:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2sd	%r13d, %xmm14, %xmm14
+	vcvtsi2sd	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm12
+	vmovupd		.LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm12
+	vmovupd		LC03(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm12, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm13, %ymm15
+
+	// shift D and the result registers for the column offset
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm6, %ymm5
+	vmovapd		%ymm3, %ymm2
+	vmovapd		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm13
+#endif
+
+	vmaskmovpd	%ymm0, %ymm14,  0(%r11)
+	vmaskmovpd	%ymm4, %ymm15,  0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x1, %ymm13, %ymm14, %ymm14
+	vmaskmovpd	%ymm1, %ymm14, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm15, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x2, %ymm13, %ymm14, %ymm14
+	vmaskmovpd	%ymm2, %ymm14, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm15, 64(%r11, %r12, 1)
+	je			3f // end
+	vblendpd	$0x4, %ymm13, %ymm14, %ymm14
+	vmaskmovpd	%ymm3, %ymm14, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm15, 96(%r11, %r12, 1)
+
+	jmp		3f
+
+0:
+	
+	cmpl	$1, %r10d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x03, %ymm4, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm0, %ymm12, %ymm0
+	vperm2f128	$0x03, %ymm13, %ymm4, %ymm12
+	vshufpd		$0x5, %ymm4, %ymm12, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x03, %ymm5, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm1, %ymm12, %ymm1
+	vperm2f128	$0x03, %ymm13, %ymm5, %ymm12
+	vshufpd		$0x5, %ymm5, %ymm12, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x03, %ymm6, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm2, %ymm12, %ymm2
+	vperm2f128	$0x03, %ymm13, %ymm6, %ymm12
+	vshufpd		$0x5, %ymm6, %ymm12, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x03, %ymm7, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm3, %ymm12, %ymm3
+	vperm2f128	$0x03, %ymm13, %ymm7, %ymm12
+	vshufpd		$0x5, %ymm7, %ymm12, %ymm7
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm15, %ymm12, %ymm15
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm12
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC08(%rip), %ymm12
+	vmovupd		.LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC08(%rip), %ymm12
+	vmovupd		LC05(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x1, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm15
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm4, %ymm14, 0(%r11, %r12, 1)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 2)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x2, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm14, 32(%r11, %r12, 1)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 2)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x4, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm14, 64(%r11, %r12, 1)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 2)
+	je			3f // end
+	vblendpd	$0x8, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm14, 96(%r11, %r12, 1)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 2)
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r10d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x03, %ymm4, %ymm0, %ymm0
+	vperm2f128	$0x03, %ymm13, %ymm4, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x03, %ymm5, %ymm1, %ymm1
+	vperm2f128	$0x03, %ymm13, %ymm5, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x03, %ymm6, %ymm2, %ymm2
+	vperm2f128	$0x03, %ymm13, %ymm6, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x03, %ymm7, %ymm3, %ymm3
+	vperm2f128	$0x03, %ymm13, %ymm7, %ymm7
+
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm14
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC09(%rip), %ymm12
+	vmovupd		.LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC09(%rip), %ymm12
+	vmovupd		LC06(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x3, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm15
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm4, %ymm14, 0(%r11, %r12, 1)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 2)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x4, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm14, 32(%r11, %r12, 1)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 2)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x8, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm14, 64(%r11, %r12, 1)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 2)
+	je			3f // end
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm14, 96(%r11, %r12, 1)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 2)
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x21, %ymm0, %ymm4, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm4, %ymm0
+	vperm2f128	$0x21, %ymm4, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x21, %ymm1, %ymm5, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm5, %ymm1
+	vperm2f128	$0x21, %ymm5, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x21, %ymm2, %ymm6, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm6, %ymm2
+	vperm2f128	$0x21, %ymm6, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x21, %ymm3, %ymm7, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm7, %ymm3
+	vperm2f128	$0x21, %ymm7, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm7
+
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm14
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC10(%rip), %ymm12
+	vmovupd		.LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC10(%rip), %ymm12
+	vmovupd		LC07(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x7, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm15
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm4, %ymm14, 0(%r11, %r12, 1)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 2)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x8, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm14, 32(%r11, %r12, 1)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 2)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm14, 64(%r11, %r12, 1)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 2)
+	je			3f // end
+	vblendpd	$0x2, %ymm15, %ymm14, %ymm14
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm14, 96(%r11, %r12, 1)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 2)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_gen_lib4, .-inner_store_l_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+//                               1      2              3          4        5          6             7          8        9          10
+// void kernel_dgemm_nt_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
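+//
+// Illustrative call from C (a sketch only: k, A, sda, B, C, sdc, D, sdd are placeholder
+// variables, and the assumed semantics are the usual lib4 panel-major ones, i.e.
+// D = beta*C + alpha * A * B^T on an 8x4 block, with sda/sdc/sdd the panel strides):
+//
+//   double alpha = 1.0, beta = 1.0;
+//   kernel_dgemm_nt_8x4_lib4(k, &alpha, A, sda, B, &beta, C, sdc, D, sdd);
+//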
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_8x4_lib4
+	.type kernel_dgemm_nt_8x4_lib4, @function
+kernel_dgemm_nt_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_8x4_lib4
+_kernel_dgemm_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_8x4_lib4
+	.def kernel_dgemm_nt_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_8x4_lib4, .-kernel_dgemm_nt_8x4_lib4
+#endif
+
+
+
+
+
+//                               1      2              3          4          5        6             7          8
+// void kernel_dgemm_nt_4x8_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D);
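+//
+// Note: the body below reuses the 8x4 inner kernel with the roles of A and B swapped
+// (B/sdb are loaded into the registers where the 8x4 kernel expects A/sda), and the
+// result is then written out through the transposing blend/scale and 4x8 store routines.
+//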
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x8_lib4
+	.type kernel_dgemm_nt_4x8_lib4, @function
+kernel_dgemm_nt_4x8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x8_lib4
+_kernel_dgemm_nt_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x8_lib4
+	.def kernel_dgemm_nt_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // B
+	movq	ARG5, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x8_lib4, .-kernel_dgemm_nt_4x8_lib4
+#endif
+
+
+
+
+
+//                                  rdi     rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32   rsp+40  rsp+48
+// void kernel_dgemm_nt_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
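+//
+// The trailing km/kn arguments are forwarded to inner_store_8x4_vs_lib4 and, per the usual
+// "vs" (variable size) convention, limit the stored block to km rows and kn columns of the
+// full 8x4 result; the computation itself is identical to kernel_dgemm_nt_8x4_lib4.
+//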
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_8x4_vs_lib4
+	.type kernel_dgemm_nt_8x4_vs_lib4, @function
+kernel_dgemm_nt_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_8x4_vs_lib4
+_kernel_dgemm_nt_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_8x4_vs_lib4
+	.def kernel_dgemm_nt_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // store address D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_8x4_vs_lib4, .-kernel_dgemm_nt_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                  1      2              3          4          5        6             7          8          9       10
+// void kernel_dgemm_nt_4x8_vs_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x8_vs_lib4
+	.type kernel_dgemm_nt_4x8_vs_lib4, @function
+kernel_dgemm_nt_4x8_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x8_vs_lib4
+_kernel_dgemm_nt_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x8_vs_lib4
+	.def kernel_dgemm_nt_4x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x8_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // B
+	movq	ARG5, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // km
+	movq	ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x8_vs_lib4, .-kernel_dgemm_nt_4x8_vs_lib4
+#endif
+
+
+
+
+
+//                                   rdi    rsi            rdx        rcx      r8         r9            rsp+8        rsp+16     rsp+24   rsp+32       rsp+40     rsp+48   rsp+56  rsp+64  rsp+72  rsp+80
+// void kernel_dgemm_nt_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
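+//
+// The "gen" variant goes through the masked store above (inner_store_8x4_gen_lib4):
+// offsetD is the row offset of the block inside the destination panel, and only the
+// rows in [m0, m1) and columns in [n0, n1) of the 8x4 result are written to D, the
+// row masks being built at run time from m0/m1 while n0/n1 set the column shift and count.
+//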
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_8x4_gen_lib4
+	.type kernel_dgemm_nt_8x4_gen_lib4, @function
+kernel_dgemm_nt_8x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_8x4_gen_lib4
+_kernel_dgemm_nt_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_8x4_gen_lib4
+	.def kernel_dgemm_nt_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // offsetC
+	movq	ARG8, %r13 // C
+	movq	ARG9, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG10, %r10 // offsetD
+	movq	ARG11, %r11 // D
+	movq	ARG12, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG13, %r13 // m0
+	movq	ARG14, %r14 // m1
+	movq	ARG15, %r15 // n0
+	movq	ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_8x4_gen_lib4, .-kernel_dgemm_nt_8x4_gen_lib4
+#endif
+
+
+
+
+
+//                               rdi    rsi            rdx        rcx      r8           r9         rsp+8    rsp+16        rsp+24     rsp+32   rsp+40     rsp+48
+// void kernel_dgemm_nn_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
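+//
+// In the "nn" variant B is read panel-wise with stride sdb; offsetB is the row offset of
+// the first element of B inside its panel, and the inner_edge routine below handles that
+// initial, not-panel-aligned part before the main inner_kernel loop takes over.
+//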
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_8x4_lib4
+	.type kernel_dgemm_nn_8x4_lib4, @function
+kernel_dgemm_nn_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_8x4_lib4
+_kernel_dgemm_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_8x4_lib4
+	.def kernel_dgemm_nn_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12 // C
+	movq	ARG10, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_8x4_lib4, .-kernel_dgemm_nn_8x4_lib4
+#endif
+
+
+
+
+
+//                               rdi    rsi            rdx        rcx          r8         r9       rsp+8         rsp+16     rsp+24
+// void kernel_dgemm_nn_4x8_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_4x8_lib4
+	.type kernel_dgemm_nn_4x8_lib4, @function
+kernel_dgemm_nn_4x8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_4x8_lib4
+_kernel_dgemm_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_4x8_lib4
+	.def kernel_dgemm_nn_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_4x8_lib4, .-kernel_dgemm_nn_4x8_lib4
+#endif
+
+
+
+
+
+//                                   rdi    rsi            rdx        rcx      r8        r9         rsp+8    rsp+16        rsp+24    rsp+32     rsp+40   rsp+48    rsp+56     rsp+64   rsp+72  rsp+80  rsp+88  rsp+96
+// void kernel_dgemm_nn_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_8x4_gen_lib4
+	.type kernel_dgemm_nn_8x4_gen_lib4, @function
+kernel_dgemm_nn_8x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_8x4_gen_lib4
+_kernel_dgemm_nn_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_8x4_gen_lib4
+	.def kernel_dgemm_nn_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_8x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13  // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12 // offsetC
+	movq	ARG10, %r13 // C
+	movq	ARG11, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG12, %r10 // offsetD
+	movq	ARG13, %r11 // D
+	movq	ARG14, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG15, %r13 // m0
+	movq	ARG16, %r14 // m1
+	movq	ARG17, %r15 // n0
+	movq	ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_8x4_gen_lib4, .-kernel_dgemm_nn_8x4_gen_lib4
+#endif
+
+
+
+
+
+//                                 rdi     rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32
+// void kernel_dsyrk_nt_l_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
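+//
+// Same computation as kernel_dgemm_nt_8x4_lib4, but the result is written with
+// inner_store_l_8x4_lib4, which stores only the lower-triangular part of the 8x4
+// block (the "_l" in the name), as expected for a symmetric rank-k update.
+//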
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_8x4_lib4
+	.type kernel_dsyrk_nt_l_8x4_lib4, @function
+kernel_dsyrk_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_8x4_lib4
+_kernel_dsyrk_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_8x4_lib4
+	.def kernel_dsyrk_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_8x4_lib4, .-kernel_dsyrk_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+//                                    rdi     rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32   rsp+40  rsp+48
+// void kernel_dsyrk_nt_l_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_8x4_vs_lib4
+	.type kernel_dsyrk_nt_l_8x4_vs_lib4, @function
+kernel_dsyrk_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_8x4_vs_lib4
+_kernel_dsyrk_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_8x4_vs_lib4
+	.def kernel_dsyrk_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // store address D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_8x4_vs_lib4, .-kernel_dsyrk_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                     rdi    rsi            rdx        rcx      r8         r9            rsp+8        rsp+16     rsp+24   rsp+32       rsp+40     rsp+48   rsp+56  rsp+64  rsp+72  rsp+80
+// void kernel_dsyrk_nt_l_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_8x4_gen_lib4
+	.type kernel_dsyrk_nt_l_8x4_gen_lib4, @function
+kernel_dsyrk_nt_l_8x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_8x4_gen_lib4
+_kernel_dsyrk_nt_l_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_8x4_gen_lib4
+	.def kernel_dsyrk_nt_l_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // offsetC
+	movq	ARG8, %r13 // C
+	movq	ARG9, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG10, %r10 // offsetD
+	movq	ARG11, %r11 // D
+	movq	ARG12, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG13, %r13 // m0
+	movq	ARG14, %r14 // m1
+	movq	ARG15, %r15 // n0
+	movq	ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_8x4_gen_lib4, .-kernel_dsyrk_nt_l_8x4_gen_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx      r8           r9         rsp+8    rsp+16     rsp+24
+// void kernel_dtrmm_nn_rl_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nn_rl_8x4_lib4
+	.type kernel_dtrmm_nn_rl_8x4_lib4, @function
+kernel_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nn_rl_8x4_lib4
+_kernel_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nn_rl_8x4_lib4
+	.def kernel_dtrmm_nn_rl_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NN_RL_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nn_rl_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nn_rl_8x4_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nn_rl_8x4_lib4, .-kernel_dtrmm_nn_rl_8x4_lib4
+#endif
+
+
+
+
+
+//                                      rdi    rsi            rdx        rcx      r8           r9         rsp+8    rsp+16       rsp+24     rsp+32   rsp+40  rsp+48  rsp+56  rsp+64
+// void kernel_dtrmm_nn_rl_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nn_rl_8x4_gen_lib4
+	.type kernel_dtrmm_nn_rl_8x4_gen_lib4, @function
+kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nn_rl_8x4_gen_lib4
+_kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nn_rl_8x4_gen_lib4
+	.def kernel_dtrmm_nn_rl_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NN_RL_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nn_rl_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // offsetD
+	movq	ARG9, %r11 // D
+	movq	ARG10, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG11, %r13 // m0
+	movq	ARG12, %r14 // m1
+	movq	ARG13, %r15 // n0
+	movq	ARG14, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nn_rl_8x4_gen_lib4, .-kernel_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32
+// void kernel_dtrmm_nt_ru_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
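+//
+// B is upper triangular and used transposed on the right ("nt_ru"): the code below runs
+// the plain inner kernel over the last k-4 columns starting at A+4*bs and B+4*bs (bs=4),
+// and the initial 4x4 triangle of B is handled afterwards by the dedicated edge routine.
+//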
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_8x4_lib4
+	.type kernel_dtrmm_nt_ru_8x4_lib4, @function
+kernel_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_8x4_lib4
+_kernel_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_8x4_lib4
+	.def kernel_dtrmm_nt_ru_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+	addq	$128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG3, %r10 // A
+	movq	ARG4, %r11 // sda
+	sall	$5, %r11d // 4*sda*sizeof(double)
+	movq	ARG5, %r12 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_8x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_8x4_lib4, .-kernel_dtrmm_nt_ru_8x4_lib4
+#endif
+
+
+
+
+
+//                                 rdi     rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32   rsp+40  rsp+48
+// void kernel_dtrmm_nt_ru_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_8x4_vs_lib4
+	.type kernel_dtrmm_nt_ru_8x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_8x4_vs_lib4
+_kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_8x4_vs_lib4
+	.def kernel_dtrmm_nt_ru_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+	addq	$128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // store address D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_8x4_vs_lib4, .-kernel_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24
+// void kernel_dpotrf_nt_l_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
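+//
+// This kernel subtracts A*B^T from C over k columns and then performs the dpotrf
+// factorization step on the resulting 8x4 block, storing the lower factor into D;
+// inv_diag_D is, as the name suggests, filled with the reciprocals of the diagonal
+// entries computed during factorization (the dtrsm kernels below take them as inv_diag_E).
+//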
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_8x4_lib4
+	.type kernel_dpotrf_nt_l_8x4_lib4, @function
+kernel_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_8x4_lib4
+_kernel_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_8x4_lib4
+	.def kernel_dpotrf_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_8x4_lib4, .-kernel_dpotrf_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+//                                     rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24              rsp+32  rsp+40 
+// void kernel_dpotrf_nt_l_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_8x4_vs_lib4
+	.type kernel_dpotrf_nt_l_8x4_vs_lib4, @function
+kernel_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_8x4_vs_lib4
+_kernel_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_8x4_vs_lib4
+	.def kernel_dpotrf_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG10, %r12 // km 
+	movq	ARG11, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_8x4_vs_lib4, .-kernel_dpotrf_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                        rdi     rsi         rdx       rcx         r8      r9          rsp+8     rsp+16      rsp+24     rsp+32   rsp+40     rsp+48   rsp+56
+// void kernel_dsyrk_dpotrf_nt_l_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
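+//
+// Fused syrk + potrf: the accumulators are first updated with +Ap*Bp^T over kp columns and
+// then with -Am*Bm^T over km columns, C is added in, the 8x4 block is factorized, and the
+// lower factor is stored to D while the diagonal reciprocals go to inv_diag_D (as its name
+// suggests), all without the partial results leaving the ymm accumulation registers.
+//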
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_8x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10 // km
+	movq	ARG6, %r11 // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+	movq	ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_8x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+//                                           rdi     rsi         rdx       rcx         r8      r9          rsp+8     rsp+16      rsp+24     rsp+32   rsp+40     rsp+48   rsp+56              rsp+64  rsp+72
+// void kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
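+// (variable-size variant of the kernel above: km and kn bound the part of D that is stored)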
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movq	ARG15, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG14, %r12 // km 
+	movq	ARG15, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                         rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32              rsp+40  rsp+48
+// void kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
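+// (right trsm with transposed lower-triangular E: roughly D = (C - A*B^T) * E^-T, using the
+//  reciprocal diagonal in inv_diag_E; km and kn bound the part of D that is stored)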
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+	.type kernel_dtrsm_nt_rl_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+	.def kernel_dtrsm_nt_rl_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_8x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                               rdi     rsi         rdx       rcx          r8     r9          rsp+8     rsp+16      rsp+24     rsp+32   rsp+40     rsp+48   rsp+56     rsp+64              rsp+72  rsp+80
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
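+// (fused dgemm + trsm: roughly D = (C + Ap*Bp^T - Am*Bm^T) * E^-T with E lower triangular;
+//  km and kn bound the part of D that is stored)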
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10  // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG13, %r10  // E 
+	movq	ARG14, %r11  // inv_diag_E 
+	movq	ARG16, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG15, %r12 // km 
+	movq	ARG16, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32 
+// void kernel_dtrsm_nt_rl_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
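+// (right trsm with transposed lower-triangular E: roughly D = (C - A*B^T) * E^-T, using the
+//  reciprocal diagonal in inv_diag_E)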
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_8x4_lib4
+	.type kernel_dtrsm_nt_rl_inv_8x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_8x4_lib4
+_kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_8x4_lib4
+	.def kernel_dtrsm_nt_rl_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_8x4_lib4, .-kernel_dtrsm_nt_rl_inv_8x4_lib4
+#endif
+
+
+
+
+
+//                                            rdi     rsi         rdx       rcx         r8      r9          rsp+8     rsp+16      rsp+24     rsp+32   rsp+40     rsp+48   rsp+56     rsp+64
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
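+// (fused dgemm + trsm: roughly D = (C + Ap*Bp^T - Am*Bm^T) * E^-T with E lower triangular)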
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10  // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG13, %r10  // E 
+	movq	ARG14, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+#endif
+
+
+
+
+
+//                                      rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24
+// void kernel_dtrsm_nt_rl_one_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E);
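+// (right trsm with transposed unit-lower-triangular E: roughly D = (C - A*B^T) * E^-T;
+//  the unit diagonal makes inv_diag_E unnecessary)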
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_one_8x4_lib4
+	.type kernel_dtrsm_nt_rl_one_8x4_lib4, @function
+kernel_dtrsm_nt_rl_one_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_one_8x4_lib4
+_kernel_dtrsm_nt_rl_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_one_8x4_lib4
+	.def kernel_dtrsm_nt_rl_one_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_one_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_one_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_one_8x4_lib4, .-kernel_dtrsm_nt_rl_one_8x4_lib4
+#endif
+
+
+
+
+
+//                                         rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32  rsp+40
+// void kernel_dtrsm_nt_rl_one_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, int km, int kn);
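+// (variable-size variant of the kernel above: km and kn bound the part of D that is stored)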
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+	.type kernel_dtrsm_nt_rl_one_8x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+	.def kernel_dtrsm_nt_rl_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_ONE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG10, %r12 // km 
+	movq	ARG11, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_one_8x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32 
+// void kernel_dtrsm_nt_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
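+// (right trsm with transposed upper-triangular E: roughly D = (C - A*B^T) * E^-T, using the
+//  reciprocal diagonal in inv_diag_E)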
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_ru_inv_8x4_lib4
+	.type kernel_dtrsm_nt_ru_inv_8x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_ru_inv_8x4_lib4
+_kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_ru_inv_8x4_lib4
+	.def kernel_dtrsm_nt_ru_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rut_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rut_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_ru_inv_8x4_lib4, .-kernel_dtrsm_nt_ru_inv_8x4_lib4
+#endif
+
+
+
+
+
+//                                         rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32              rsp+40  rsp+48
+// void kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
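+// (variable-size variant of the kernel above: km and kn bound the part of D that is stored)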
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+	.type kernel_dtrsm_nt_ru_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+	.def kernel_dtrsm_nt_ru_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_ru_inv_8x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40
+// void kernel_dtrsm_nn_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
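+// (right trsm with upper-triangular E: roughly D = (C - A*B) * E^-1, using the reciprocal
+//  diagonal in inv_diag_E; B is a panel with panel stride sdb)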
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ru_inv_8x4_lib4
+	.type kernel_dtrsm_nn_ru_inv_8x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ru_inv_8x4_lib4
+_kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ru_inv_8x4_lib4
+	.def kernel_dtrsm_nn_ru_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_run_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ru_inv_8x4_lib4, .-kernel_dtrsm_nn_ru_inv_8x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40              rsp+48  rsp+56
+// void kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
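+// (variable-size variant of the kernel above: km and kn bound the part of D that is stored)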
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+	.type kernel_dtrsm_nn_ru_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+	.def kernel_dtrsm_nn_ru_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_run_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG12, %r12 // km
+	movq	ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ru_inv_8x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40
+// void kernel_dtrsm_nn_ll_one_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde);
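+// (left trsm with unit-lower-triangular E: roughly D = E^-1 * (C - A*B), with E stored as a
+//  panel of panel stride sde)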
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ll_one_8x4_lib4
+	.type kernel_dtrsm_nn_ll_one_8x4_lib4, @function
+kernel_dtrsm_nn_ll_one_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ll_one_8x4_lib4
+_kernel_dtrsm_nn_ll_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ll_one_8x4_lib4
+	.def kernel_dtrsm_nn_ll_one_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // sde
+	sall	$5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lln_one_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ll_one_8x4_lib4, .-kernel_dtrsm_nn_ll_one_8x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40   rsp+48  rsp+56
+// void kernel_dtrsm_nn_ll_one_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, int km, int kn);
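+// (variable-size variant of the kernel above: km and kn bound the part of D that is stored)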
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+	.type kernel_dtrsm_nn_ll_one_8x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+	.def kernel_dtrsm_nn_ll_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // sde
+	sall	$5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lln_one_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG12, %r12 // km
+	movq	ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ll_one_8x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40   rsp+48
+// void kernel_dtrsm_nn_lu_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
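+// (left trsm with upper-triangular E: roughly D = E^-1 * (C - A*B), using the reciprocal
+//  diagonal in inv_diag_E)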
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_lu_inv_8x4_lib4
+	.type kernel_dtrsm_nn_lu_inv_8x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_lu_inv_8x4_lib4
+_kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_lu_inv_8x4_lib4
+	.def kernel_dtrsm_nn_lu_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // sde
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG12, %r12  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lun_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lun_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_lu_inv_8x4_lib4, .-kernel_dtrsm_nn_lu_inv_8x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40   rsp+48              rsp+56  rsp+64
+// void kernel_dtrsm_nn_lu_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
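+// (variable-size variant of the kernel above: km and kn bound the part of D that is stored)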
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+	.type kernel_dtrsm_nn_lu_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+	.def kernel_dtrsm_nn_lu_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // sde
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG12, %r12  // inv_diag_E 
+	movq	ARG13, %r13  // km
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LUN_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG13, %r12  // km
+	movq	ARG14, %r13  // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_lu_inv_8x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                edi    rsi        rdx      rcx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32
+// void kernel_dgetrf_nn_l_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
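+// (roughly: unpivoted LU factorization of the 8x4 panel C - A*B; the factors are stored in D
+//  and the reciprocals of the U diagonal in inv_diag_D)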
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_l_8x4_lib4
+	.type kernel_dgetrf_nn_l_8x4_lib4, @function
+kernel_dgetrf_nn_l_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_l_8x4_lib4
+_kernel_dgetrf_nn_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_l_8x4_lib4
+	.def kernel_dgetrf_nn_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG10, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_l_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	// epilogue
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_l_8x4_lib4, .-kernel_dgetrf_nn_l_8x4_lib4
+#endif
+
+
+
+
+
+//                                   edi    rsi        rdx      rcx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32              rsp+40  rsp+48
+// void kernel_dgetrf_nn_l_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
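+// (variable-size variant of the kernel above: km and kn bound the part of D that is stored)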
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_l_8x4_vs_lib4
+	.type kernel_dgetrf_nn_l_8x4_vs_lib4, @function
+kernel_dgetrf_nn_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_l_8x4_vs_lib4
+_kernel_dgetrf_nn_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_l_8x4_vs_lib4
+	.def kernel_dgetrf_nn_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG10, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_l_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG11, %r12  // km
+	movq	ARG12, %r13  // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_l_8x4_vs_lib4, .-kernel_dgetrf_nn_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                             1         2           3           4           5
+// void kernel_dlarfb4_r_8_lib4(int kmax, double *pV, double *pT, double *pD, int sdd);
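+// (appears to apply a block of 4 Householder reflectors, given by pV and the 4x4 triangular
+//  factor pT as in LAPACK dlarfb, to the 8-row panel pD from the right)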
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dlarfb4_r_8_lib4
+	.type kernel_dlarfb4_r_8_lib4, @function
+kernel_dlarfb4_r_8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dlarfb4_r_8_lib4
+_kernel_dlarfb4_r_8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dlarfb4_r_8_lib4
+	.def kernel_dlarfb4_r_8_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // D
+	movq	ARG5, %r12 // sdd
+	sall	$5, %r12d
+	movq	ARG2, %r13 // V
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // D
+	movq	ARG5, %r12 // sdd
+	sall	$5, %r12d
+	movq	ARG2, %r13 // V
+
+	//
+	vmovapd			0(%r11), %ymm12
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vaddpd			%ymm12, %ymm0, %ymm0
+	vaddpd			%ymm14, %ymm4, %ymm4
+	//
+	vmovapd			32(%r11), %ymm12
+	vmovapd			32(%r11, %r12, 1), %ymm14
+	vaddpd			%ymm12, %ymm1, %ymm1
+	vaddpd			%ymm14, %ymm5, %ymm5
+	vbroadcastsd	32(%r13), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	//
+	vmovapd			64(%r11), %ymm12
+	vmovapd			64(%r11, %r12, 1), %ymm14
+	vaddpd			%ymm12, %ymm2, %ymm2
+	vaddpd			%ymm14, %ymm6, %ymm6
+	vbroadcastsd	64(%r13), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	72(%r13), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	//
+	vmovapd			96(%r11), %ymm12
+	vmovapd			96(%r11, %r12, 1), %ymm14
+	vaddpd			%ymm12, %ymm3, %ymm3
+	vaddpd			%ymm14, %ymm7, %ymm7
+	vbroadcastsd	96(%r13), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vaddpd			%ymm15, %ymm4, %ymm4
+	vbroadcastsd	104(%r13), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	112(%r13), %ymm13
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm13, %ymm14, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+
+	movq	ARG3, %r10 // T
+
+	//
+	vbroadcastsd	120(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vmulpd			%ymm7, %ymm12, %ymm7
+	//
+	vbroadcastsd	112(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm6, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+	vbroadcastsd	80(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vmulpd			%ymm6, %ymm12, %ymm6
+	//
+	vbroadcastsd	104(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm5, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+	vbroadcastsd	72(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm5, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	40(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vmulpd			%ymm5, %ymm12, %ymm5
+	//
+	vbroadcastsd	96(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm3, %ymm3
+	vmulpd			%ymm4, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm7, %ymm7
+	vbroadcastsd	64(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm2, %ymm2
+	vmulpd			%ymm4, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm6, %ymm6
+	vbroadcastsd	32(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm4, %ymm12, %ymm15
+	vaddpd			%ymm15, %ymm5, %ymm5
+	vbroadcastsd	0(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+	vmulpd			%ymm4, %ymm12, %ymm4
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // V
+	movq	ARG4, %r12 // D
+	movq	ARG5, %r13 // sdd
+	sall	$5, %r13d
+
+	//
+	vmovapd			0(%r12), %ymm12
+	vmovapd			0(%r12, %r13, 1), %ymm14
+	vaddpd			%ymm12, %ymm0, %ymm12
+	vaddpd			%ymm14, %ymm4, %ymm14
+	vmovapd			%ymm12, 0(%r12)
+	vmovapd			%ymm14, 0(%r12, %r13, 1)
+	//
+	vmovapd			32(%r12), %ymm12
+	vmovapd			32(%r12, %r13, 1), %ymm14
+	vbroadcastsd	32(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vaddpd			%ymm12, %ymm1, %ymm12
+	vaddpd			%ymm14, %ymm5, %ymm14
+	vmovapd			%ymm12, 32(%r12)
+	vmovapd			%ymm14, 32(%r12, %r13, 1)
+	//
+	vmovapd			64(%r12), %ymm12
+	vmovapd			64(%r12, %r13, 1), %ymm14
+	vbroadcastsd	64(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	72(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vaddpd			%ymm12, %ymm2, %ymm12
+	vaddpd			%ymm14, %ymm6, %ymm14
+	vmovapd			%ymm12, 64(%r12)
+	vmovapd			%ymm14, 64(%r12, %r13, 1)
+	//
+	vmovapd			96(%r12), %ymm12
+	vmovapd			96(%r12, %r13, 1), %ymm14
+	vbroadcastsd	96(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm4, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	104(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm5, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vbroadcastsd	112(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm12, %ymm12
+	vmulpd			%ymm6, %ymm13, %ymm15
+	vaddpd			%ymm15, %ymm14, %ymm14
+	vaddpd			%ymm12, %ymm3, %ymm12
+	vaddpd			%ymm14, %ymm7, %ymm14
+	vmovapd			%ymm12, 96(%r12)
+	vmovapd			%ymm14, 96(%r12, %r13, 1)
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEBP_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgebp_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgebp_add_nn_8x4_lib4
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlarfb4_r_8_lib4, .-kernel_dlarfb4_r_8_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemm_diag_lib4.c b/kernel/avx/kernel_dgemm_diag_lib4.c
new file mode 100644
index 0000000..d64f977
--- /dev/null
+++ b/kernel/avx/kernel_dgemm_diag_lib4.c
@@ -0,0 +1,866 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+
+
+// B is the diagonal of a matrix, beta==0.0 case
+void kernel_dgemm_diag_right_4_a0_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m256d
+		alpha0,
+		mask_f,
+		sign,
+		a_00,
+		b_00, b_11, b_22, b_33,
+		d_00, d_01, d_02, d_03;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_sd( alpha );
+	
+	b_00 = _mm256_broadcast_sd( &B[0] );
+	b_00 = _mm256_mul_pd( b_00, alpha0 );
+	b_11 = _mm256_broadcast_sd( &B[1] );
+	b_11 = _mm256_mul_pd( b_11, alpha0 );
+	b_22 = _mm256_broadcast_sd( &B[2] );
+	b_22 = _mm256_mul_pd( b_22, alpha0 );
+	b_33 = _mm256_broadcast_sd( &B[3] );
+	b_33 = _mm256_mul_pd( b_33, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		a_00 = _mm256_load_pd( &A[4] );
+		d_01 = _mm256_mul_pd( a_00, b_11 );
+		a_00 = _mm256_load_pd( &A[8] );
+		d_02 = _mm256_mul_pd( a_00, b_22 );
+		a_00 = _mm256_load_pd( &A[12] );
+		d_03 = _mm256_mul_pd( a_00, b_33 );
+
+		_mm256_store_pd( &D[0], d_00 );
+		_mm256_store_pd( &D[4], d_01 );
+		_mm256_store_pd( &D[8], d_02 );
+		_mm256_store_pd( &D[12], d_03 );
+
+		A += 4*sda;
+		D += 4*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+		double m_f = kmax-k;
+
+		mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		a_00 = _mm256_load_pd( &A[4] );
+		d_01 = _mm256_mul_pd( a_00, b_11 );
+		a_00 = _mm256_load_pd( &A[8] );
+		d_02 = _mm256_mul_pd( a_00, b_22 );
+		a_00 = _mm256_load_pd( &A[12] );
+		d_03 = _mm256_mul_pd( a_00, b_33 );
+
+		_mm256_maskstore_pd( &D[0], mask_i, d_00 );
+		_mm256_maskstore_pd( &D[4], mask_i, d_01 );
+		_mm256_maskstore_pd( &D[8], mask_i, d_02 );
+		_mm256_maskstore_pd( &D[12], mask_i, d_03 );
+
+		}
+	
+	}
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m256d
+		alpha0, beta0,
+		mask_f,
+		sign,
+		a_00,
+		b_00, b_11, b_22, b_33,
+		c_00,
+		d_00, d_01, d_02, d_03;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_sd( alpha );
+	beta0  = _mm256_broadcast_sd( beta );
+	
+	b_00 = _mm256_broadcast_sd( &B[0] );
+	b_00 = _mm256_mul_pd( b_00, alpha0 );
+	b_11 = _mm256_broadcast_sd( &B[1] );
+	b_11 = _mm256_mul_pd( b_11, alpha0 );
+	b_22 = _mm256_broadcast_sd( &B[2] );
+	b_22 = _mm256_mul_pd( b_22, alpha0 );
+	b_33 = _mm256_broadcast_sd( &B[3] );
+	b_33 = _mm256_mul_pd( b_33, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		a_00 = _mm256_load_pd( &A[4] );
+		d_01 = _mm256_mul_pd( a_00, b_11 );
+		a_00 = _mm256_load_pd( &A[8] );
+		d_02 = _mm256_mul_pd( a_00, b_22 );
+		a_00 = _mm256_load_pd( &A[12] );
+		d_03 = _mm256_mul_pd( a_00, b_33 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+		c_00 = _mm256_load_pd( &C[4] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_01 = _mm256_add_pd( c_00, d_01 );
+		c_00 = _mm256_load_pd( &C[8] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_02 = _mm256_add_pd( c_00, d_02 );
+		c_00 = _mm256_load_pd( &C[12] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_03 = _mm256_add_pd( c_00, d_03 );
+
+		_mm256_store_pd( &D[0], d_00 );
+		_mm256_store_pd( &D[4], d_01 );
+		_mm256_store_pd( &D[8], d_02 );
+		_mm256_store_pd( &D[12], d_03 );
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+		double m_f = kmax-k;
+
+		mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		a_00 = _mm256_load_pd( &A[4] );
+		d_01 = _mm256_mul_pd( a_00, b_11 );
+		a_00 = _mm256_load_pd( &A[8] );
+		d_02 = _mm256_mul_pd( a_00, b_22 );
+		a_00 = _mm256_load_pd( &A[12] );
+		d_03 = _mm256_mul_pd( a_00, b_33 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+		c_00 = _mm256_load_pd( &C[4] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_01 = _mm256_add_pd( c_00, d_01 );
+		c_00 = _mm256_load_pd( &C[8] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_02 = _mm256_add_pd( c_00, d_02 );
+		c_00 = _mm256_load_pd( &C[12] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_03 = _mm256_add_pd( c_00, d_03 );
+
+		_mm256_maskstore_pd( &D[0], mask_i, d_00 );
+		_mm256_maskstore_pd( &D[4], mask_i, d_01 );
+		_mm256_maskstore_pd( &D[8], mask_i, d_02 );
+		_mm256_maskstore_pd( &D[12], mask_i, d_03 );
+
+		}
+	
+	}
+
+
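+// Illustrative, unoptimized reference for the two kernels above (a sketch
+// for documentation only: ref_dgemm_diag_right_4_lib4 is not part of the
+// library API and is excluded from the build; alpha and beta are passed by
+// value here for brevity). Column j of each 4-row panel starts at offset
+// 4*j, and the kernels compute D = beta*C + alpha * A * diag(B), with the
+// _a0_ variant corresponding to beta==0.0.
+#if 0
+static void ref_dgemm_diag_right_4_lib4(int kmax, double alpha, double *A, int sda, double *B, double beta, double *C, int sdc, double *D, int sdd)
+	{
+	int k, i, j;
+	for(k=0; k<kmax; k+=4)
+		{
+		int m = kmax-k<4 ? kmax-k : 4; // rows in the (possibly partial) panel
+		for(j=0; j<4; j++)
+			{
+			for(i=0; i<m; i++)
+				{
+				D[i+4*j] = beta*C[i+4*j] + alpha*B[j]*A[i+4*j];
+				}
+			}
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		}
+	}
+#endif
+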
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_3_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m256d
+		alpha0, beta0,
+		mask_f,
+		sign,
+		a_00,
+		b_00, b_11, b_22,
+		c_00,
+		d_00, d_01, d_02;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_sd( alpha );
+	beta0  = _mm256_broadcast_sd( beta );
+	
+	b_00 = _mm256_broadcast_sd( &B[0] );
+	b_00 = _mm256_mul_pd( b_00, alpha0 );
+	b_11 = _mm256_broadcast_sd( &B[1] );
+	b_11 = _mm256_mul_pd( b_11, alpha0 );
+	b_22 = _mm256_broadcast_sd( &B[2] );
+	b_22 = _mm256_mul_pd( b_22, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		a_00 = _mm256_load_pd( &A[4] );
+		d_01 = _mm256_mul_pd( a_00, b_11 );
+		a_00 = _mm256_load_pd( &A[8] );
+		d_02 = _mm256_mul_pd( a_00, b_22 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+		c_00 = _mm256_load_pd( &C[4] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_01 = _mm256_add_pd( c_00, d_01 );
+		c_00 = _mm256_load_pd( &C[8] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_02 = _mm256_add_pd( c_00, d_02 );
+
+		_mm256_store_pd( &D[0], d_00 );
+		_mm256_store_pd( &D[4], d_01 );
+		_mm256_store_pd( &D[8], d_02 );
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+		double m_f = kmax-k;
+
+		mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		a_00 = _mm256_load_pd( &A[4] );
+		d_01 = _mm256_mul_pd( a_00, b_11 );
+		a_00 = _mm256_load_pd( &A[8] );
+		d_02 = _mm256_mul_pd( a_00, b_22 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+		c_00 = _mm256_load_pd( &C[4] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_01 = _mm256_add_pd( c_00, d_01 );
+		c_00 = _mm256_load_pd( &C[8] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_02 = _mm256_add_pd( c_00, d_02 );
+
+		_mm256_maskstore_pd( &D[0], mask_i, d_00 );
+		_mm256_maskstore_pd( &D[4], mask_i, d_01 );
+		_mm256_maskstore_pd( &D[8], mask_i, d_02 );
+
+		}
+	
+	}
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_2_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m256d
+		alpha0, beta0,
+		mask_f,
+		sign,
+		a_00,
+		b_00, b_11,
+		c_00,
+		d_00, d_01;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_sd( alpha );
+	beta0  = _mm256_broadcast_sd( beta );
+	
+	b_00 = _mm256_broadcast_sd( &B[0] );
+	b_00 = _mm256_mul_pd( b_00, alpha0 );
+	b_11 = _mm256_broadcast_sd( &B[1] );
+	b_11 = _mm256_mul_pd( b_11, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		a_00 = _mm256_load_pd( &A[4] );
+		d_01 = _mm256_mul_pd( a_00, b_11 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+		c_00 = _mm256_load_pd( &C[4] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_01 = _mm256_add_pd( c_00, d_01 );
+
+		_mm256_store_pd( &D[0], d_00 );
+		_mm256_store_pd( &D[4], d_01 );
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+		double m_f = kmax-k;
+
+		mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		a_00 = _mm256_load_pd( &A[4] );
+		d_01 = _mm256_mul_pd( a_00, b_11 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+		c_00 = _mm256_load_pd( &C[4] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_01 = _mm256_add_pd( c_00, d_01 );
+
+		_mm256_maskstore_pd( &D[0], mask_i, d_00 );
+		_mm256_maskstore_pd( &D[4], mask_i, d_01 );
+
+		}
+
+	}
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_1_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m256d
+		alpha0, beta0,
+		mask_f,
+		sign,
+		a_00,
+		b_00,
+		c_00,
+		d_00;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_sd( alpha );
+	beta0  = _mm256_broadcast_sd( beta );
+	
+	b_00 = _mm256_broadcast_sd( &B[0] );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+
+		_mm256_store_pd( &D[0], d_00 );
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+		double m_f = kmax-k;
+
+		mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+		a_00 = _mm256_load_pd( &A[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+
+		_mm256_maskstore_pd( &D[0], mask_i, d_00 );
+
+		}
+	
+	}
+
+
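+// Note on the clean-up code of the kernels above (descriptive comment
+// only): with m = kmax-k remaining rows, the vector {0.5, 1.5, 2.5, 3.5}
+// minus m is negative exactly in its first m lanes, so reinterpreting it
+// as a 256-bit integer yields a mask whose sign bits select those rows,
+// and _mm256_maskstore_pd() then writes only the first m rows of the
+// final, partial 4-row panel.
+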
+
+// A is the diagonal of a matrix, beta==0.0 case
+void kernel_dgemm_diag_left_4_a0_lib4(int kmax, double *alpha, double *A, double *B, double *D)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m256d
+		alpha0,
+		sign,
+		a_00,
+		b_00,
+		d_00, d_01, d_02, d_03;
+	
+	alpha0 = _mm256_broadcast_sd( alpha );
+	
+	a_00 = _mm256_load_pd( &A[0] );
+	a_00 = _mm256_mul_pd( a_00, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_00 = _mm256_load_pd( &B[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[4] );
+		d_01 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[8] );
+		d_02 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[12] );
+		d_03 = _mm256_mul_pd( a_00, b_00 );
+
+		_mm256_store_pd( &D[0], d_00 );
+		_mm256_store_pd( &D[4], d_01 );
+		_mm256_store_pd( &D[8], d_02 );
+		_mm256_store_pd( &D[12], d_03 );
+
+		B += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_00 = _mm256_load_pd( &B[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+
+		_mm256_store_pd( &D[0], d_00 );
+
+		B += 4;
+		D += 4;
+		
+		}
+
+	}
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m256d
+		alpha0, beta0,
+		sign,
+		a_00,
+		b_00,
+		c_00,
+		d_00, d_01, d_02, d_03;
+	
+	alpha0 = _mm256_broadcast_sd( alpha );
+	beta0  = _mm256_broadcast_sd( beta );
+	
+	a_00 = _mm256_load_pd( &A[0] );
+	a_00 = _mm256_mul_pd( a_00, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_00 = _mm256_load_pd( &B[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[4] );
+		d_01 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[8] );
+		d_02 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[12] );
+		d_03 = _mm256_mul_pd( a_00, b_00 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+		c_00 = _mm256_load_pd( &C[4] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_01 = _mm256_add_pd( c_00, d_01 );
+		c_00 = _mm256_load_pd( &C[8] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_02 = _mm256_add_pd( c_00, d_02 );
+		c_00 = _mm256_load_pd( &C[12] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_03 = _mm256_add_pd( c_00, d_03 );
+
+		_mm256_store_pd( &D[0], d_00 );
+		_mm256_store_pd( &D[4], d_01 );
+		_mm256_store_pd( &D[8], d_02 );
+		_mm256_store_pd( &D[12], d_03 );
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_00 = _mm256_load_pd( &B[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+
+		_mm256_store_pd( &D[0], d_00 );
+
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+
+	}
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_3_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m256i
+		mask;
+
+	__m256d
+		alpha0, beta0,
+		sign,
+		a_00,
+		b_00,
+		c_00,
+		d_00, d_01, d_02, d_03;
+	
+	mask = _mm256_set_epi64x( 1, -1, -1, -1 );
+		
+	alpha0 = _mm256_broadcast_sd( alpha );
+	beta0  = _mm256_broadcast_sd( beta );
+	
+	a_00 = _mm256_load_pd( &A[0] );
+	a_00 = _mm256_mul_pd( a_00, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_00 = _mm256_load_pd( &B[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[4] );
+		d_01 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[8] );
+		d_02 = _mm256_mul_pd( a_00, b_00 );
+		b_00 = _mm256_load_pd( &B[12] );
+		d_03 = _mm256_mul_pd( a_00, b_00 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+		c_00 = _mm256_load_pd( &C[4] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_01 = _mm256_add_pd( c_00, d_01 );
+		c_00 = _mm256_load_pd( &C[8] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_02 = _mm256_add_pd( c_00, d_02 );
+		c_00 = _mm256_load_pd( &C[12] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_03 = _mm256_add_pd( c_00, d_03 );
+
+		_mm256_maskstore_pd( &D[0], mask, d_00 );
+		_mm256_maskstore_pd( &D[4], mask, d_01 );
+		_mm256_maskstore_pd( &D[8], mask, d_02 );
+		_mm256_maskstore_pd( &D[12], mask, d_03 );
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_00 = _mm256_load_pd( &B[0] );
+		d_00 = _mm256_mul_pd( a_00, b_00 );
+
+		c_00 = _mm256_load_pd( &C[0] );
+		c_00 = _mm256_mul_pd( c_00, beta0 );
+		d_00 = _mm256_add_pd( c_00, d_00 );
+
+		_mm256_maskstore_pd( &D[0], mask, d_00 );
+
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+
+	}
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_2_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	__m128d
+		alpha0, beta0,
+		sign,
+		a_00,
+		b_00,
+		c_00,
+		d_00, d_01, d_02, d_03;
+		
+	alpha0 = _mm_loaddup_pd( alpha );
+	beta0  = _mm_loaddup_pd( beta );
+	
+	a_00 = _mm_load_pd( &A[0] );
+	a_00 = _mm_mul_pd( a_00, alpha0 );
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_00 = _mm_load_pd( &B[0] );
+		d_00 = _mm_mul_pd( a_00, b_00 );
+		b_00 = _mm_load_pd( &B[4] );
+		d_01 = _mm_mul_pd( a_00, b_00 );
+		b_00 = _mm_load_pd( &B[8] );
+		d_02 = _mm_mul_pd( a_00, b_00 );
+		b_00 = _mm_load_pd( &B[12] );
+		d_03 = _mm_mul_pd( a_00, b_00 );
+
+		c_00 = _mm_load_pd( &C[0] );
+		c_00 = _mm_mul_pd( c_00, beta0 );
+		d_00 = _mm_add_pd( c_00, d_00 );
+		c_00 = _mm_load_pd( &C[4] );
+		c_00 = _mm_mul_pd( c_00, beta0 );
+		d_01 = _mm_add_pd( c_00, d_01 );
+		c_00 = _mm_load_pd( &C[8] );
+		c_00 = _mm_mul_pd( c_00, beta0 );
+		d_02 = _mm_add_pd( c_00, d_02 );
+		c_00 = _mm_load_pd( &C[12] );
+		c_00 = _mm_mul_pd( c_00, beta0 );
+		d_03 = _mm_add_pd( c_00, d_03 );
+
+		_mm_store_pd( &D[0], d_00 );
+		_mm_store_pd( &D[4], d_01 );
+		_mm_store_pd( &D[8], d_02 );
+		_mm_store_pd( &D[12], d_03 );
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_00 = _mm_load_pd( &B[0] );
+		d_00 = _mm_mul_pd( a_00, b_00 );
+
+		c_00 = _mm_load_pd( &C[0] );
+		c_00 = _mm_mul_pd( c_00, beta0 );
+		d_00 = _mm_add_pd( c_00, d_00 );
+
+		_mm_store_pd( &D[0], d_00 );
+
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+
+	
+	}
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_1_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0,
+		b_0,
+		c_0;
+	
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = A[0] * alpha0;
+		
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		b_0 = B[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+
+		D[0+bs*1] = c_0;
+		
+
+		b_0 = B[0+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+
+		D[0+bs*2] = c_0;
+		
+
+		b_0 = B[0+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+
+		D[0+bs*3] = c_0;
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+		
+	}
+
+
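+// Illustrative, unoptimized reference for the diag-left kernels above (a
+// sketch for documentation only: ref_dgemm_diag_left_lib4 is not a library
+// function and is excluded from the build; n selects the 4/3/2/1 variant,
+// and alpha and beta are passed by value for brevity). Here A holds the n
+// diagonal entries and the kernels compute D = beta*C + alpha * diag(A) * B,
+// one panel column of 4 doubles at a time.
+#if 0
+static void ref_dgemm_diag_left_lib4(int kmax, int n, double alpha, double *A, double *B, double beta, double *C, double *D)
+	{
+	int i, k;
+	for(k=0; k<kmax; k++)
+		{
+		for(i=0; i<n; i++)
+			{
+			D[i+4*k] = beta*C[i+4*k] + alpha*A[i]*B[i+4*k];
+			}
+		}
+	}
+#endif
+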
+
diff --git a/kernel/avx/kernel_dgemv_12_lib4.S b/kernel/avx/kernel_dgemv_12_lib4.S
new file mode 100644
index 0000000..c51ad9a
--- /dev/null
+++ b/kernel/avx/kernel_dgemv_12_lib4.S
@@ -0,0 +1,1322 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
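+// Note on the macros above (descriptive comment only): on System V
+// (OS_LINUX, OS_MAC) the first six arguments arrive in rdi, rsi, rdx, rcx,
+// r8, r9 and the seventh sits at 8(%rsp) on entry, hence STACKSIZE+8 after
+// the prologue adjusts the stack. On Windows x64 the first four arguments
+// arrive in rcx, rdx, r8, r9, the caller leaves a 32-byte shadow space, so
+// the fifth argument sits at 40(%rsp) on entry, and rdi, rsi and xmm6-xmm15
+// are callee-saved, which is why the prologue spills them as well.
+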
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- x
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z8 z9 za zb]_a
+// ymm3  <- [z0 z1 z2 z3]_b
+// ymm4  <- [z4 z5 z6 z7]_b
+// ymm5  <- [z8 z9 za zb]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- x+k*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z8 z9 za zb]_a
+// ymm3  <- [z0 z1 z2 z3]_b
+// ymm4  <- [z4 z5 z6 z7]_b
+// ymm5  <- [z8 z9 za zb]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_N_12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_n_12_lib4, @function
+inner_kernel_dgemv_add_n_12_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_n_12_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_12_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r11, %r14 // A1 <- A0
+	addq	%r12, %r14 // A1 <- A0 + 4*sda*sizeof(double)
+	movq	%r14, %r15 // A2 <- A1
+	addq	%r12, %r15 // A2 <- A1 + 4*sda*sizeof(double)
+
+	cmpl	$4, %r10d
+
+	prefetcht0	0(%r11) // software prefetch
+	prefetcht0	0(%r14) // software prefetch
+	prefetcht0	0(%r15) // software prefetch
+	prefetcht0	64(%r11) // software prefetch
+	prefetcht0	64(%r14) // software prefetch
+	prefetcht0	64(%r15) // software prefetch
+
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	128(%r11) // software prefetch
+	prefetcht0	128(%r14) // software prefetch
+	prefetcht0	128(%r15) // software prefetch
+
+	vbroadcastsd	0(%r13), %ymm12
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmovapd	0(%r14), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd	0(%r15), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	
+	subl	$4, %r10d
+
+	vbroadcastsd	8(%r13), %ymm12
+	vmovapd	32(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vmovapd	32(%r14), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	vmovapd	32(%r15), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	
+	prefetcht0	192(%r11) // software prefetch
+	prefetcht0	192(%r14) // software prefetch
+	prefetcht0	192(%r15) // software prefetch
+
+	vbroadcastsd	16(%r13), %ymm12
+	vmovapd	64(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmovapd	64(%r14), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd	64(%r15), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vbroadcastsd	24(%r13), %ymm12
+	addq	$32, %r13 // x+4
+	vmovapd	96(%r11), %ymm8
+	addq	$128, %r11 // A0+4*bs
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vmovapd	96(%r14), %ymm8
+	addq	$128, %r14 // A1+4*bs
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	vmovapd	96(%r15), %ymm8
+	addq	$128, %r15 // A2+4*bs
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vbroadcastsd	0(%r13), %ymm12
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmovapd	0(%r14), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd	0(%r15), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	
+	addq	$32, %r11
+	addq	$32, %r14
+	addq	$32, %r15
+	addq	$8, %r13
+	
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+
+	jg		0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_n_12_lib4, .-inner_kernel_dgemv_add_n_12_lib4
+#endif
+#endif
+
+
+
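+// In C-like terms (an illustrative note, not generated code), the routine
+// above computes, for j = 0..k-1,
+//   z[0:4] += A0[:,j]*x[j],   z[4:8] += A1[:,j]*x[j],   z[8:12] += A2[:,j]*x[j]
+// where A0 is the 4-row panel at A and A1, A2 are the next two panels at
+// A + 4*sda and A + 8*sda doubles; the two accumulator sets (the _a and _b
+// registers in the comments above) are summed later in the blend step.
+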
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm4  <- [z4a z4b z4c z4d]
+// ymm5  <- [z5a z5b z5c z5d]
+// ymm6  <- [z6a z6b z6c z6d]
+// ymm7  <- [z7a z7b z7c z7d]
+// ymm8  <- [z8a z8b z8c z8d]
+// ymm9  <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x+k*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm4  <- [z4a z4b z4c z4d]
+// ymm5  <- [z5a z5b z5c z5d]
+// ymm6  <- [z6a z6b z6c z6d]
+// ymm7  <- [z7a z7b z7c z7d]
+// ymm8  <- [z8a z8b z8c z8d]
+// ymm9  <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_T_12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_t_12_lib4, @function
+inner_kernel_dgemv_add_t_12_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_t_12_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_12_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+
+	prefetcht0	0(%r11) // software prefetch
+	prefetcht0	64(%r11) // software prefetch
+	prefetcht0	128(%r11) // software prefetch
+	prefetcht0	192(%r11) // software prefetch
+	prefetcht0	256(%r11) // software prefetch
+	prefetcht0	320(%r11) // software prefetch
+
+	jl		0f // clean-up loop
+
+	movq	%r11, %r14
+	addq	%r12, %r14 // A+bs*sda
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	0(%r14) // software prefetch
+
+	vmovupd	0(%r13), %ymm12
+	addq	$32, %r13 // x+4
+
+	vmovapd	0(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	subl	$4, %r10d
+
+	vmovapd	32(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	prefetcht0	64(%r14) // software prefetch
+
+	vmovapd	64(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmovapd	96(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+
+	prefetcht0	128(%r14) // software prefetch
+
+	vmovapd	128(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	
+	vmovapd	160(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	
+	prefetcht0	192(%r14) // software prefetch
+
+	vmovapd	192(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	vmovapd	224(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	prefetcht0	256(%r14) // software prefetch
+
+	vmovapd	256(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm8, %ymm15, %ymm8
+	
+	vmovapd	288(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm9, %ymm15, %ymm9
+	
+	prefetcht0	320(%r14) // software prefetch
+
+	vmovapd	320(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm10, %ymm15, %ymm10
+
+	vmovapd	352(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm11, %ymm15, %ymm11
+	
+//	addq	%r12, %r11 // A+bs*sda
+	movq	%r14, %r11 // A+bs*sda
+	addq	%r12, %r14 // A+bs*sda+bs*sda
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vcvtsi2sd	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vsubpd		%ymm14, %ymm13, %ymm14
+
+	vmaskmovpd	0(%r13), %ymm14, %ymm12
+
+	vmovapd	0(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	vmovapd	32(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	vmovapd	64(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmovapd	96(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+		
+	vmovapd	128(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	
+	vmovapd	160(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	
+	vmovapd	192(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	vmovapd	224(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	vmovapd	256(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm8, %ymm15, %ymm8
+	
+	vmovapd	288(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm9, %ymm15, %ymm9
+	
+	vmovapd	320(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm10, %ymm15, %ymm10
+
+	vmovapd	352(%r11), %ymm13
+	vmulpd	%ymm13, %ymm12, %ymm15
+	vaddpd	%ymm11, %ymm15, %ymm11
+
+	sall	$3, %r10d
+//	movslq	%r10d, %r10
+	addq	%r10, %r11
+	addq	%r10, %r13
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_t_12_lib4, .-inner_kernel_dgemv_add_t_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z8 z9 za zb]_a
+// ymm3  <- [z0 z1 z2 z3]_b
+// ymm4  <- [z4 z5 z6 z7]_b
+// ymm5  <- [z8 z9 za zb]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2  <- [z8 z9 za zb]
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_SCALE_AB_12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_scale_ab_12_lib4, @function
+inner_blend_n_scale_ab_12_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_scale_ab_12_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_12_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm1, %ymm4, %ymm1
+	vaddpd	%ymm2, %ymm5, %ymm2
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm1, %ymm15, %ymm1
+	vmulpd	%ymm2, %ymm15, %ymm2
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd		0(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	vmovupd		32(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm1, %ymm14, %ymm1
+	vmovupd		64(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm2, %ymm14, %ymm2
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_scale_ab_12_lib4, .-inner_blend_n_scale_ab_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8  <- [z8a z8b z8c z8d]
+// ymm9  <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2  <- [z8 z9 za zb]
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_AB_12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_ab_12_lib4, @function
+inner_blend_t_scale_ab_12_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_ab_12_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_12_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm5, %ymm4, %ymm4
+	vhaddpd	%ymm9, %ymm8, %ymm8
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vhaddpd	%ymm7, %ymm6, %ymm6
+	vhaddpd	%ymm11, %ymm10, %ymm10
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm3
+	vperm2f128	$0x2, %ymm4, %ymm6, %ymm5
+	vperm2f128	$0x2, %ymm8, %ymm10, %ymm9
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vperm2f128	$0x13, %ymm4, %ymm6, %ymm4
+	vperm2f128	$0x13, %ymm8, %ymm10, %ymm8
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm4, %ymm5, %ymm1
+	vaddpd	%ymm8, %ymm9, %ymm2
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm1, %ymm15, %ymm1
+	vmulpd	%ymm2, %ymm15, %ymm2
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd		0(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	vmovupd		32(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm1, %ymm14, %ymm1
+	vmovupd		64(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm2, %ymm14, %ymm2
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_ab_12_lib4, .-inner_blend_t_scale_ab_12_lib4
+#endif
+#endif
+
+
+
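+// Reduction note (descriptive comment only): on entry each of ymm0-ymm11
+// holds the four partial sums of one result z_n. The vhaddpd instructions
+// add adjacent pairs across two registers, the vperm2f128 instructions
+// regroup the 128-bit halves, and the final vaddpd leaves the twelve
+// reduced values in ymm0 = [z0..z3], ymm1 = [z4..z7], ymm2 = [z8..zb],
+// which the routine then scales by alpha and combines with beta*y.
+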
+
+
+// common inner routine with file scope
+//
+// blender for ta==n
+//
+// input arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z8 z9 za zb]_a
+// ymm3  <- [z0 z1 z2 z3]_b
+// ymm4  <- [z4 z5 z6 z7]_b
+// ymm5  <- [z8 z9 za zb]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0  <- [z0 z1 z2 z3]
+// ymm1  <- [z4 z5 z6 z7]
+// ymm2  <- [z8 z9 za zb]
+// ymm3  <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLENDER_N_12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blender_n_12_lib4, @function
+inner_blender_n_12_lib4:
+#elif defined(OS_MAC)
+_inner_blender_n_12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blender_n_12_lib4; .scl 2; .type 32; .endef
+inner_blender_n_12_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm1, %ymm4, %ymm1
+	vaddpd	%ymm2, %ymm5, %ymm2
+
+	cmpl	$0, %r10d // alg
+	je		0f // return
+
+	cmpl	$1, %r10d // alg
+	jne		1f // alg==-1
+
+	// alg==1
+	vmovupd		0(%r11), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovupd		64(%r11), %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+
+	jmp		0f // return
+
+1:
+
+	// alg==-1
+	vmovupd		0(%r11), %ymm15
+	vsubpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vsubpd		%ymm1, %ymm15, %ymm1
+	vmovupd		64(%r11), %ymm15
+	vsubpd		%ymm2, %ymm15, %ymm2
+
+0: // return
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blender_n_12_lib4, .-inner_blender_n_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==t
+//
+// input arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm4  <- [z4a z4b z4c z4d]
+// ymm5  <- [z5a z5b z5c z5d]
+// ymm6  <- [z6a z6b z6c z6d]
+// ymm7  <- [z7a z7b z7c z7d]
+// ymm8  <- [z8a z8b z8c z8d]
+// ymm9  <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0  <- [z0 z1 z2 z3]
+// ymm1  <- [z4 z5 z6 z7]
+// ymm2  <- [z8 z9 za zb]
+// ymm3  <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLENDER_T_12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blender_t_12_lib4, @function
+inner_blender_t_12_lib4:
+#elif defined(OS_MAC)
+_inner_blender_t_12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blender_t_12_lib4; .scl 2; .type 32; .endef
+inner_blender_t_12_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm5, %ymm4, %ymm4
+	vhaddpd	%ymm9, %ymm8, %ymm8
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vhaddpd	%ymm7, %ymm6, %ymm6
+	vhaddpd	%ymm11, %ymm10, %ymm10
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm3
+	vperm2f128	$0x2, %ymm4, %ymm6, %ymm5
+	vperm2f128	$0x2, %ymm8, %ymm10, %ymm9
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vperm2f128	$0x13, %ymm4, %ymm6, %ymm4
+	vperm2f128	$0x13, %ymm8, %ymm10, %ymm8
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm4, %ymm5, %ymm1
+	vaddpd	%ymm8, %ymm9, %ymm2
+
+	cmpl	$0, %r10d // alg
+	je		0f // return
+
+	cmpl	$1, %r10d // alg
+	jne		1f // alg==-1
+
+	// alg==1
+	vmovupd		0(%r11), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovupd		64(%r11), %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+
+	jmp		0f // return
+
+1:
+
+	// alg==-1
+	vmovupd		0(%r11), %ymm15
+	vsubpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vsubpd		%ymm1, %ymm15, %ymm1
+	vmovupd		64(%r11), %ymm15
+	vsubpd		%ymm2, %ymm15, %ymm2
+
+0: // return
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blender_t_12_lib4, .-inner_blender_t_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store 
+//
+// input arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+//
+// output arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_12_lib4, @function
+inner_store_12_lib4:
+#elif defined(OS_MAC)
+_inner_store_12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_12_lib4; .scl 2; .type 32; .endef
+inner_store_12_lib4:
+#endif
+#endif
+	
+	vmovupd %ymm0, 0(%r10)
+	vmovupd %ymm1, 32(%r10)
+	vmovupd %ymm2, 64(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_12_lib4, .-inner_store_12_lib4
+#endif
+#endif
+
+
+
+
+
+//                             rdi    rsi            rdx        rcx      r8         r9            rsp+8      rsp+16
+// void kernel_dgemv_n_12_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_n_12_lib4
+	.type kernel_dgemv_n_12_lib4, @function
+kernel_dgemv_n_12_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_n_12_lib4
+_kernel_dgemv_n_12_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_n_12_lib4
+	.def kernel_dgemv_n_12_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_12_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+
+
+	// call inner dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_12_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_12_lib4
+#endif
+#endif
+
+
+	// call inner blender n
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_12_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_12_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_n_12_lib4, .-kernel_dgemv_n_12_lib4
+#endif
+
+
+
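+// Usage sketch (illustrative, following the prototype and register
+// comments above): the call computes z[0:12] = alpha*A*x[0:k] + beta*y[0:12],
+// where A is a 12 x k block stored in 4-row panels spaced 4*sda doubles
+// apart, e.g. from C:
+//
+//	kernel_dgemv_n_12_lib4(k, &alpha, A, sda, x, &beta, y, z);
+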
+
+
+//                            rdi    rsi           rdx         rcx      r8         r9            rsp+8      rsp+16
+// void kernel_dgemv_t_12_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_t_12_lib4
+	.type kernel_dgemv_t_12_lib4, @function
+kernel_dgemv_t_12_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_t_12_lib4
+_kernel_dgemv_t_12_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_t_12_lib4
+	.def kernel_dgemv_t_12_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_12_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_12_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_12_lib4
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_12_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_12_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_t_12_lib4, .-kernel_dgemv_t_12_lib4
+#endif
+
+
+
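+// Usage sketch (illustrative): the call computes
+// z[0:12] = alpha*A'*x[0:k] + beta*y[0:12], i.e. twelve dot products of x
+// against the columns of a k x 12 panel-major block A, e.g. from C:
+//
+//	kernel_dgemv_t_12_lib4(k, &alpha, A, sda, x, &beta, y, z);
+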
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemv_4_lib4.S b/kernel/avx/kernel_dgemv_4_lib4.S
new file mode 100644
index 0000000..656e220
--- /dev/null
+++ b/kernel/avx/kernel_dgemv_4_lib4.S
@@ -0,0 +1,4503 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- x
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z0 z1 z2 z3]_b
+// ymm2  <- [z0 z1 z2 z3]_c
+// ymm3  <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- x+k*sizeof(double)
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z0 z1 z2 z3]_b
+// ymm2  <- [z0 z1 z2 z3]_c
+// ymm3  <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
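+// For reference, a minimal C sketch of what this inner routine accumulates
+// (illustrative only; it assumes the 4-row panel-major layout implied by the
+// 32-byte column stride, i.e. element (i,j) of the panel sits at A[i+4*j]):
+//
+//   static void ref_dgemv_add_n_4(int k, const double *A, const double *x, double z[4])
+//       {
+//       for(int j=0; j<k; j++)
+//           for(int i=0; i<4; i++)
+//               z[i] += A[i+4*j] * x[j];
+//       }
+//
+// The unrolled main loop keeps four partial sums (ymm0..ymm3), one per column,
+// the clean-up loop folds the k%4 tail into ymm0 only, and the blend routines
+// further down sum the four accumulators back together.
+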
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_n_4_lib4, @function
+inner_kernel_dgemv_add_n_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_n_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovapd	0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	subl	$4, %r10d
+
+	vmovapd	32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	vmovapd	64(%r11), %ymm8
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmovapd	96(%r11), %ymm8
+	vbroadcastsd	24(%r12), %ymm12
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	
+	addq	$128, %r11
+	addq	$32, %r12
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vmovapd	0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	addq	$32, %r11
+	addq	$8, %r12
+	
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+
+	jg		0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_n_4_lib4, .-inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x+k*sizeof(double)
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
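+// Reference C for the transposed inner kernel (sketch only; A is assumed
+// panel-major with panel stride 4*sda doubles, so element (i,j) sits at
+// A[(i/4)*4*sda + i%4 + 4*j]):
+//
+//   static void ref_dgemv_add_t_4(int k, int sda, const double *A, const double *x, double z[4])
+//       {
+//       for(int i=0; i<k; i++)
+//           for(int j=0; j<4; j++)
+//               z[j] += A[(i/4)*4*sda + i%4 + 4*j] * x[i];
+//       }
+//
+// Each z[j] is kept as the four lanes of ymm0..ymm3 and reduced later by the
+// blend_t routines; the k%4 tail is handled with vmaskmovpd loads built from
+// the LC02 constant defined elsewhere in this file.
+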
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_t_4_lib4, @function
+inner_kernel_dgemv_add_t_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_t_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_4_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovupd	0(%r13), %ymm12
+
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	subl	$4, %r10d
+
+	vmovapd	32(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	vmovapd	64(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmovapd	96(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	
+	addq	%r12, %r11
+	addq	$32, %r13
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vcvtsi2sd	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vsubpd		%ymm14, %ymm13, %ymm14
+
+	vmaskmovpd	0(%r13), %ymm14, %ymm12
+
+	vmaskmovpd	0(%r11), %ymm14, %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	vmaskmovpd	32(%r11), %ymm14, %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	vmaskmovpd	64(%r11), %ymm14, %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmaskmovpd	96(%r11), %ymm14, %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+		
+	sall	$3, %r10d
+//	movslq	%r10d, %r10
+	addq	%r10, %r11
+	addq	%r10, %r13
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_t_4_lib4, .-inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t
+// r14   <- z_n
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t+k*sizeof(double)
+// r14   <- z_n+k*sizeof(double)
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
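+// Reference C for the fused n/t update used by dsymv (sketch only; same
+// panel-major indexing assumption as above):
+//
+//   static void ref_dgemv_add_nt_4(int k, int sda, const double *A,
+//           const double *x_t, const double x_n[4], double *z_n, double z_t[4])
+//       {
+//       for(int i=0; i<k; i++)
+//           {
+//           const double *a = A + (i/4)*4*sda + i%4;
+//           for(int j=0; j<4; j++)
+//               {
+//               z_t[j] += a[4*j] * x_t[i]; // 'transposed' part, reduced later
+//               z_n[i] += a[4*j] * x_n[j]; // 'normal' part, stored back each row
+//               }
+//           }
+//       }
+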
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_nt_4_lib4, @function
+inner_kernel_dgemv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_nt_4_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovupd	0(%r13), %ymm12
+	vmovupd	0(%r14), %ymm13
+
+	vmovapd	0(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm14, %ymm6, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	subl	$4, %r10d
+
+	vmovapd	32(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmulpd	%ymm14, %ymm7, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovapd	64(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm14, %ymm8, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+
+	vmovapd	96(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vmulpd	%ymm14, %ymm9, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovupd	%ymm13, 0(%r14) 
+
+	addq	%r12, %r11
+	addq	$32, %r13
+	addq	$32, %r14
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vcvtsi2sd	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vsubpd		%ymm14, %ymm13, %ymm11
+
+	vmaskmovpd	0(%r13), %ymm11, %ymm12
+	vmaskmovpd	0(%r14), %ymm11, %ymm13
+
+//	vmovupd	%ymm14, -32(%rsp) // spill mask to stack
+
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	0(%r11), %ymm11, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm14, %ymm6, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	32(%r11), %ymm11, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmulpd	%ymm14, %ymm7, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	64(%r11), %ymm11, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm14, %ymm8, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	96(%r11), %ymm11, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vmulpd	%ymm14, %ymm9, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+		
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	%ymm13, %ymm11, 0(%r14)
+
+	sall	$3, %r10d // *sizeof(double)
+	addq	%r10, %r11
+	addq	%r10, %r13
+	addq	%r10, %r14
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_nt_4_lib4, .-inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x
+// r14d  <- offA
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 
+// r11   <- 
+// r12   <- 
+// r13   <- 
+// r14d  <- offA
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
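+// This edge routine aligns an unaligned start: it rewinds A and x by offA and
+// does one masked pass over the first panel so that only the rows
+// offA <= i < min(offA+kmax, 4) contribute. Scalar sketch (illustrative only,
+// indices relative to the start of the panel):
+//
+//   int iend = offA+kmax<4 ? offA+kmax : 4;
+//   for(int i=offA; i<iend; i++)
+//       for(int j=0; j<4; j++)
+//           z[j] += A[i+4*j] * x[i];
+//
+// The lane mask is built by comparing offA and offA+kmax against the LC02
+// constant (assumed to hold the lane indices plus 0.5) via vsubpd/vandpd.
+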
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_DGEMV_ADD_T_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgemv_add_t_4_lib4, @function
+inner_edge_dgemv_add_t_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemv_add_t_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgemv_add_t_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemv_add_t_4_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r14d
+	jle		0f // return
+
+	movl	%r14d, %r15d
+	sall	$3, %r15d // offA*sizeof(double)
+
+	subq	%r15, %r11 // A - offA
+	subq	%r15, %r13 // x - offA
+
+	movl	%r10d, %r15d // kmax
+	addl	%r14d, %r15d // kmax + offA
+
+	vcvtsi2sd	%r14d, %xmm14, %xmm14 // offA
+	vcvtsi2sd	%r15d, %xmm15, %xmm15 // offA + kmax
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm13, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm13, %ymm15
+	vandpd		%ymm15, %ymm14, %ymm14
+
+	vmaskmovpd	0(%r13), %ymm14, %ymm12
+
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	vmovapd	32(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	vmovapd	64(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmovapd	96(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+
+	addq	$32, %r13 // x + 4
+	addq	%r12, %r11 // A + bs*sda
+		
+	addl	%r14d, %r10d
+	subl	$4, %r10d // kmax - (4-offA)
+	
+0: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgemv_add_t_4_lib4, .-inner_edge_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10   <- kmax
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t
+// r14   <- z_n
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- kmax-4
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t+k*sizeof(double)
+// r14   <- z_n+k*sizeof(double)
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
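+// This edge routine handles the 4x4 diagonal block of dsymv using only the
+// lower triangle of A; the blends zero the strictly upper part so each stored
+// entry is used once for the 'transposed' update and once for the 'normal'
+// update. Scalar sketch (illustrative only, A[i+4*j] indexing):
+//
+//   for(int j=0; j<4; j++)
+//       {
+//       z_t[j] += A[j+4*j] * x_t[j]; // diagonal, counted once
+//       for(int i=j+1; i<4; i++)
+//           {
+//           z_t[j] += A[i+4*j] * x_t[i];
+//           z_n[i] += A[i+4*j] * x_n[j];
+//           }
+//       }
+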
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dsymv_add_nt_4_lib4, @function
+inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dsymv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_lib4:
+#endif
+#endif
+
+	vmovupd		0(%r13), %ymm12
+	vmovupd		0(%r14), %ymm13
+
+	vmovupd		0(%r11), %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm6, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmovupd		32(%r11), %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x3, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm7, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmovupd		64(%r11), %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x3, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x7, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm8, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+
+	vmovupd		96(%r11), %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x7, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+//	vxorpd		%ymm15, %ymm15, %ymm15
+//	vblendpd	$0x0, %ymm14, %ymm15, %ymm14
+//	vmulpd		%ymm14, %ymm9, %ymm15
+//	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmovupd		%ymm13, 0(%r14) 
+
+	addq	%r12, %r11
+	addq	$32, %r13
+	addq	$32, %r14
+	
+	subq	$4, %r10
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dsymv_add_nt_4_lib4, .-inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10   <- kmax
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t
+// r14   <- z_n
+// r15   <- offA
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- kmax-4
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t+k*sizeof(double)
+// r14   <- z_n+k*sizeof(double)
+// r15   <- offA
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_DSYMV_ADD_NT_4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dsymv_add_nt_4_gen_lib4, @function
+inner_edge_dsymv_add_nt_4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dsymv_add_nt_4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_gen_lib4:
+#endif
+#endif
+
+	movl	$4, %eax
+	cmpl	%eax, %r10d
+	jge		0f
+	movl	%r10d, %eax
+0:
+	subl	%r15d, %eax
+
+	vcvtsi2sd	%eax, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubpd		%ymm14, %ymm13, %ymm11
+
+	vmaskmovpd	0(%r13), %ymm11, %ymm12
+	vmaskmovpd	0(%r14), %ymm11, %ymm13
+
+	vmaskmovpd	0(%r11), %ymm11, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm6, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmaskmovpd	32(%r11), %ymm11, %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x3, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm7, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmaskmovpd	64(%r11), %ymm11, %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x3, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x7, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm8, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+
+	vmaskmovpd	96(%r11), %ymm11, %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x7, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+//	vxorpd		%ymm15, %ymm15, %ymm15
+//	vblendpd	$0x0, %ymm14, %ymm15, %ymm14
+//	vmulpd		%ymm14, %ymm9, %ymm15
+//	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmaskmovpd	%ymm13, %ymm11, 0(%r14)
+
+	subl	%eax, %r10d
+
+	salq	$3, %rax // *sizeof(double)
+	addq	%rax, %r11
+	subq	$32, %r11
+	addq	%r12, %r11
+	addq	%rax, %r13
+	addq	%rax, %r14
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dsymv_add_nt_4_gen_lib4, .-inner_edge_dsymv_add_nt_4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n
+//
+// input arguments:
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_lib4, @function
+inner_blend_n_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_lib4; .scl 2; .type 32; .endef
+inner_blend_n_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm1, %ymm0
+	vaddpd	%ymm2, %ymm3, %ymm2
+	vaddpd	%ymm0, %ymm2, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_lib4, .-inner_blend_n_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t
+//
+// input arguments:
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
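+// Scalar equivalent of this reduction (sketch): each of ymm0..ymm3 holds the
+// four partial products of one output entry, and the result packs their sums
+// into a single vector:
+//
+//   for(int j=0; j<4; j++)
+//       z[j] = acc[j][0] + acc[j][1] + acc[j][2] + acc[j][3];
+//
+// vhaddpd sums adjacent lanes within each 128-bit half and the two vperm2f128
+// regroup the halves so the final vaddpd completes the 4x4 transpose-and-sum.
+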
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_lib4, @function
+inner_blend_t_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_lib4; .scl 2; .type 32; .endef
+inner_blend_t_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm1
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vaddpd	%ymm0, %ymm1, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_lib4, .-inner_blend_t_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_scale_ab_4_lib4, @function
+inner_blend_n_scale_ab_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_scale_ab_4_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_4_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm1, %ymm0
+	vaddpd	%ymm2, %ymm3, %ymm2
+	vaddpd	%ymm0, %ymm2, %ymm0
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd		0(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_scale_ab_4_lib4, .-inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_SCALE_M11_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_scale_m11_4_lib4, @function
+inner_blend_n_scale_m11_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_m11_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_scale_m11_4_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_m11_4_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm1, %ymm0
+	vaddpd	%ymm2, %ymm3, %ymm2
+	vaddpd	%ymm0, %ymm2, %ymm0
+
+	// beta
+	vmovupd		0(%r10), %ymm14
+	vsubpd		%ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_scale_m11_4_lib4, .-inner_blend_n_scale_m11_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_ab_4_lib4, @function
+inner_blend_t_scale_ab_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_ab_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_4_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm1
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vaddpd	%ymm0, %ymm1, %ymm0
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd		0(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_ab_4_lib4, .-inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_a1_4_lib4, @function
+inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_a1_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm1
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vaddpd	%ymm0, %ymm1, %ymm0
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+
+	// beta
+	vmovupd		0(%r11), %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_a1_4_lib4, .-inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_m11_4_lib4, @function
+inner_blend_t_scale_m11_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_m11_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_m11_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_m11_4_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm1
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vaddpd	%ymm0, %ymm1, %ymm0
+
+	vmovupd		0(%r10), %ymm14
+	vsubpd		%ymm0, %ymm14, %ymm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_m11_4_lib4, .-inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
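+// Scalar sketch of this forward substitution (illustrative only): the 4x4
+// block E is lower triangular, stored as E[i+4*j], and the diagonal is
+// applied as a multiplication by its precomputed reciprocal inv_diag_E:
+//
+//   for(int j=0; j<4; j++)
+//       {
+//       z[j] *= inv_diag_E[j];
+//       for(int i=j+1; i<4; i++)
+//           z[i] -= E[i+4*j] * z[j];
+//       }
+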
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSV_LN_INV_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsv_ln_inv_4_lib4, @function
+inner_edge_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsv_ln_inv_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_ln_inv_4_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm1
+	vblendpd		$0x1, %ymm1, %ymm0, %ymm0
+
+	vmovapd			0(%r10), %ymm13
+	vblendpd		$0x1, %ymm14, %ymm13, %ymm13
+	vpermilpd		$0x0, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm1
+	vblendpd		$0x2, %ymm1, %ymm0, %ymm0
+
+	vmovapd			32(%r10), %ymm13
+	vblendpd		$0x3, %ymm14, %ymm13, %ymm13
+	vpermilpd		$0x3, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm1
+	vblendpd		$0x4, %ymm1, %ymm0, %ymm0
+
+	vmovapd			64(%r10), %ymm13
+	vblendpd		$0x7, %ymm14, %ymm13, %ymm13
+	vpermilpd		$0x0, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm1
+	vblendpd		$0x8, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsv_ln_inv_4_lib4, .-inner_edge_dtrsv_ln_inv_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS, variable size version
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSV_LN_INV_4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsv_ln_inv_4_vs_lib4, @function
+inner_edge_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsv_ln_inv_4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_ln_inv_4_vs_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm1
+	vblendpd		$0x1, %ymm1, %ymm0, %ymm0
+	vmovapd			0(%r10), %ymm13
+	vblendpd		$0x1, %ymm14, %ymm13, %ymm13
+	vpermilpd		$0x0, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	cmpl			$2, %r12d
+	jl				0f // ret
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm1
+	vblendpd		$0x2, %ymm1, %ymm0, %ymm0
+	vmovapd			32(%r10), %ymm13
+	vblendpd		$0x3, %ymm14, %ymm13, %ymm13
+	vpermilpd		$0x3, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	cmpl			$3, %r12d
+	jl				0f // ret
+
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm1
+	vblendpd		$0x4, %ymm1, %ymm0, %ymm0
+	vmovapd			64(%r10), %ymm13
+	vblendpd		$0x7, %ymm14, %ymm13, %ymm13
+	vpermilpd		$0x0, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm13, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+	cmpl			$4, %r12d
+	jl				0f // ret
+
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm1
+	vblendpd		$0x8, %ymm1, %ymm0, %ymm0
+
+	// return
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsv_ln_inv_4_vs_lib4, .-inner_edge_dtrsv_ln_inv_4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
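+// Scalar sketch of this backward substitution with the transposed lower
+// triangle (illustrative only; E[i+4*j] indexing, reciprocal diagonal):
+//
+//   for(int j=3; j>=0; j--)
+//       {
+//       z[j] *= inv_diag_E[j];
+//       for(int i=0; i<j; i++)
+//           z[i] -= E[j+4*i] * z[j]; // E^T(i,j) = E(j,i)
+//       }
+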
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSV_LT_INV_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsv_lt_inv_4_lib4, @function
+inner_edge_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsv_lt_inv_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_4_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vmovapd			16(%r10), %xmm12
+	vmovapd			48(%r10), %xmm13
+	vunpcklpd		%xmm13, %xmm12, %xmm9
+	vblendpd		$0xc, %ymm14, %ymm9, %ymm9
+	vunpckhpd		%xmm13, %xmm12, %xmm10
+	vmovsd			8(%r10), %xmm8
+	vblendpd		$0xe, %ymm14, %ymm8, %ymm8
+	vmovsd			88(%r10), %xmm11
+	vinsertf128		$0x1, %xmm11, %ymm10, %ymm10
+	vblendpd		$0x8, %ymm14, %ymm10, %ymm10
+
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x8, %ymm1, %ymm0, %ymm0
+
+	vpermilpd		$0xf, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x4, %ymm1, %ymm0, %ymm0
+
+	vpermilpd		$0x0, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x2, %ymm1, %ymm0, %ymm0
+
+	vpermilpd		$0x3, %ymm0, %ymm12
+//	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+//	vbroadcastsd	8(%r11), %ymm12
+	vmovsd			0(%r11), %xmm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsv_lt_inv_4_lib4, .-inner_edge_dtrsv_lt_inv_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- k
+// r13  <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- k
+// r13  <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSV_LT_INV_3_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsv_lt_inv_3_lib4, @function
+inner_edge_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsv_lt_inv_3_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_3_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vmovapd			16(%r10), %xmm12
+	vmovapd			48(%r10), %xmm13
+	vunpcklpd		%xmm13, %xmm12, %xmm9
+	vblendpd		$0xc, %ymm14, %ymm9, %ymm9
+	vunpckhpd		%xmm13, %xmm12, %xmm10
+	vmovsd			8(%r10), %xmm8
+	vblendpd		$0xe, %ymm14, %ymm8, %ymm8
+	vmovsd			88(%r10), %xmm11
+	vinsertf128		$0x1, %xmm11, %ymm10, %ymm10
+	vblendpd		$0x8, %ymm14, %ymm10, %ymm10
+
+//	vbroadcastsd	24(%r11), %ymm12
+//	vmulpd			%ymm12, %ymm0, %ymm1
+//	vblendpd		$0x8, %ymm1, %ymm0, %ymm0
+
+	vmovupd			0(%r13), %ymm12
+	vblendpd		$0x8, %ymm12, %ymm0, %ymm0
+	
+	cmpl			$4, %r12d
+	jl				0f
+
+	vpermilpd		$0xf, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+0:
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x4, %ymm1, %ymm0, %ymm0
+
+	vpermilpd		$0x0, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x2, %ymm1, %ymm0, %ymm0
+
+	vpermilpd		$0x3, %ymm0, %ymm12
+//	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+//	vbroadcastsd	8(%r11), %ymm12
+	vmovsd			0(%r11), %xmm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsv_lt_inv_3_lib4, .-inner_edge_dtrsv_lt_inv_3_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- k
+// r13  <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- k
+// r13  <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSV_LT_INV_2_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsv_lt_inv_2_lib4, @function
+inner_edge_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsv_lt_inv_2_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_2_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	cmpl			$3, %r12d
+
+	vmovapd			16(%r10), %xmm12
+	vmovapd			48(%r10), %xmm13
+	vunpcklpd		%xmm13, %xmm12, %xmm9
+	vblendpd		$0xc, %ymm14, %ymm9, %ymm9
+	vunpckhpd		%xmm13, %xmm12, %xmm10
+	vmovsd			8(%r10), %xmm8
+	vblendpd		$0xe, %ymm14, %ymm8, %ymm8
+//	vmovsd			88(%r10), %xmm11
+//	vinsertf128		$0x1, %xmm11, %ymm10, %ymm10
+//	vblendpd		$0x8, %ymm14, %ymm10, %ymm10
+	vblendpd		$0xc, %ymm14, %ymm10, %ymm10
+
+//	vbroadcastsd	24(%r11), %ymm12
+//	vmulpd			%ymm12, %ymm0, %ymm1
+//	vblendpd		$0x8, %ymm1, %ymm0, %ymm0
+
+	vmovupd			0(%r13), %ymm12
+	vblendpd		$0xc, %ymm12, %ymm0, %ymm0
+	
+	je				0f
+	jl				1f
+
+	vpermilpd		$0xf, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+0:
+
+//	vbroadcastsd	16(%r11), %ymm12
+//	vmulpd			%ymm12, %ymm0, %ymm1
+//	vblendpd		$0x4, %ymm1, %ymm0, %ymm0
+
+	vpermilpd		$0x0, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+1:
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x2, %ymm1, %ymm0, %ymm0
+
+	vpermilpd		$0x3, %ymm0, %ymm12
+//	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+//	vbroadcastsd	8(%r11), %ymm12
+
+	vmovsd			0(%r11), %xmm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsv_lt_inv_2_lib4, .-inner_edge_dtrsv_lt_inv_2_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope 
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- k
+// r13  <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- k
+// r13  <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSV_LT_INV_1_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsv_lt_inv_1_lib4, @function
+inner_edge_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsv_lt_inv_1_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_1_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vmovupd			0(%r13), %ymm12
+	vblendpd		$0xe, %ymm12, %ymm0, %ymm0
+	
+	cmpl			$3, %r12d
+	je				0f
+
+	cmpl			$2, %r12d
+	je				1f
+	jl				2f
+
+	vmovsd			24(%r10), %xmm10
+	vblendpd		$0xe, %ymm14, %ymm10, %ymm10
+	vpermilpd		$0xf, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm10, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+0:
+
+	vmovsd			16(%r10), %xmm9
+	vblendpd		$0xe, %ymm14, %ymm9, %ymm9
+	vpermilpd		$0x0, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm9, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+
+1:
+
+	vmovsd			8(%r10), %xmm8
+	vblendpd		$0xe, %ymm14, %ymm8, %ymm8
+	vpermilpd		$0x3, %ymm0, %ymm12
+//	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vsubpd			%ymm15, %ymm0, %ymm0
+//	vbroadcastsd	8(%r11), %ymm12
+
+2:
+
+	vmovsd			0(%r11), %xmm12
+	vmulpd			%ymm12, %ymm0, %ymm1
+	vblendpd		$0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsv_lt_inv_1_lib4, .-inner_edge_dtrsv_lt_inv_1_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- x
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z0 z1 z2 z3]_b
+// ymm2  <- [z0 z1 z2 z3]_c
+// ymm3  <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- k-4
+// r11   <- A+4*4*sizeof(double)
+// r12   <- x+4*sizeof(double)
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z0 z1 z2 z3]_b
+// ymm2  <- [z0 z1 z2 z3]_c
+// ymm3  <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
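+// Scalar sketch of this 4x4 upper-triangular corner of z += U*x (illustrative
+// only; the blends zero the sub-diagonal entries of each loaded column,
+// A[i+4*j] indexing):
+//
+//   for(int j=0; j<4; j++)
+//       for(int i=0; i<=j; i++)
+//           z[i] += A[i+4*j] * x[j];
+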
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMV_UN_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmv_un_4_lib4, @function
+inner_edge_dtrmv_un_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmv_un_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmv_un_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmv_un_4_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vmovapd			0(%r11), %ymm8
+	vblendpd		$0x1, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	
+	subl			$4, %r10d
+
+	vmovapd			32(%r11), %ymm8
+	vblendpd		$0x3, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	
+	vmovapd			64(%r11), %ymm8
+	vblendpd		$0x7, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	16(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+
+	vmovapd			96(%r11), %ymm8
+	vbroadcastsd	24(%r12), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	
+	addq			$128, %r11
+	addq			$32, %r12
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmv_un_4_lib4, .-inner_edge_dtrmv_un_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z0 z1 z2 z3]_b
+// ymm2  <- [z0 z1 z2 z3]_c
+// ymm3  <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x+k*sizeof(double)
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z0 z1 z2 z3]_b
+// ymm2  <- [z0 z1 z2 z3]_c
+// ymm3  <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_DTRMV_UT_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dtrmv_ut_4_lib4, @function
+inner_kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dtrmv_ut_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dtrmv_ut_4_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+	jle		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovupd	0(%r13), %ymm12
+
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	subl	$4, %r10d
+
+	vmovapd	32(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	vmovapd	64(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmovapd	96(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	
+	addq	%r12, %r11
+	addq	$32, %r13
+	
+	cmpl	$4, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+//	vcvtsi2sd	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+//	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+//	vmovupd		LC02(%rip), %ymm13
+#endif
+//	vmovddup	%xmm14, %xmm14
+//	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+//	vsubpd		%ymm14, %ymm13, %ymm14
+//
+//	vmaskmovpd	0(%r13), %ymm14, %ymm12
+
+	vmovupd		0(%r13), %ymm12
+
+	vxorpd		%ymm14, %ymm14, %ymm14
+
+	vmovapd		0(%r11), %ymm8
+	vblendpd	$0x1, %ymm8, %ymm14, %ymm8
+	vmulpd		%ymm8, %ymm12, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	
+	vmovapd	32(%r11), %ymm8
+	vblendpd	$0x3, %ymm8, %ymm14, %ymm8
+	vmulpd		%ymm8, %ymm12, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	
+	vmovapd		64(%r11), %ymm8
+	vblendpd	$0x7, %ymm8, %ymm14, %ymm8
+	vmulpd		%ymm8, %ymm12, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+
+	vmovapd		96(%r11), %ymm8
+	vmulpd		%ymm8, %ymm12, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+		
+	sall		$3, %r10d
+//	movslq		%r10d, %r10
+	addq		%r10, %r11
+	addq		%r10, %r13
+	xorl		%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dtrmv_ut_4_lib4, .-inner_kernel_dtrmv_ut_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store 
+//
+// input arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4_lib4, @function
+inner_store_4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4_lib4; .scl 2; .type 32; .endef
+inner_store_4_lib4:
+#endif
+#endif
+	
+	vmovupd %ymm0,  0(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4_lib4, .-inner_store_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store vs
+//
+// input arguments:
+// r10   <- D
+// r11d  <- km
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d  <- km
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4_vs_lib4, @function
+inner_store_4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	vmaskmovpd	%ymm0, %ymm15,  0(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4_vs_lib4, .-inner_store_4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store gen
+//
+// input arguments:
+// r10   <- D
+// r11d  <- k0 : start from (inc)
+// r12d  <- k1 : up to (exc)
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d  <- k0 : start from (inc)
+// r12d  <- k1 : up to (exc)
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
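+// The store mask keeps only the rows k0 <= i < k1 (sketch; the mask is built
+// from the LC02 constant, assumed to hold the lane indices plus 0.5, so the
+// sign bits of k0-LC02 and LC02-k1 select the wanted lanes):
+//
+//   for(int i=0; i<4; i++)
+//       if(i>=k0 && i<k1)
+//           z[i] = acc[i];
+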
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4_gen_lib4, @function
+inner_store_4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_4_gen_lib4:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2sd	%r11d, %xmm14, %xmm14
+	vcvtsi2sd	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm12
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm12, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm12, %ymm15
+	vandpd		%ymm14, %ymm15, %ymm15
+
+	vmaskmovpd	%ymm0, %ymm15,  0(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4_gen_lib4, .-inner_store_4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+//                            1      2              3          4          5             6          7
+// void kernel_dgemv_n_4_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z);
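+//
+// Illustrative usage (sketch; names are placeholders, packing is the caller's
+// responsibility): with A_panel holding k columns of one 4-row panel, element
+// (i,j) at A_panel[i+4*j],
+//
+//   double alpha = 1.0, beta = 1.0;
+//   kernel_dgemv_n_4_lib4(k, &alpha, A_panel, x, &beta, y, z);
+//
+// computes z[0..3] = alpha * A_panel(0:3,0:k-1) * x + beta * y[0..3].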
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_n_4_lib4
+	.type kernel_dgemv_n_4_lib4, @function
+kernel_dgemv_n_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_n_4_lib4
+_kernel_dgemv_n_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_n_4_lib4
+	.def kernel_dgemv_n_4_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11   // beta
+	movq	ARG6, %r12   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_n_4_lib4, .-kernel_dgemv_n_4_lib4
+#endif
+
+
+
+
+
+//                               1      2              3          4          5             6          7          8
+// void kernel_dgemv_n_4_vs_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_n_4_vs_lib4
+	.type kernel_dgemv_n_4_vs_lib4, @function
+kernel_dgemv_n_4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_n_4_vs_lib4
+_kernel_dgemv_n_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_n_4_vs_lib4
+	.def kernel_dgemv_n_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11   // beta
+	movq	ARG6, %r12   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	ARG8, %r11 // k1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_n_4_vs_lib4, .-kernel_dgemv_n_4_vs_lib4
+#endif
+
+
+
+
+
+//                                1      2              3          4          5             6          7          8       9
+// void kernel_dgemv_n_4_gen_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k0, int k1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_n_4_gen_lib4
+	.type kernel_dgemv_n_4_gen_lib4, @function
+kernel_dgemv_n_4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_n_4_gen_lib4
+_kernel_dgemv_n_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_n_4_gen_lib4
+	.def kernel_dgemv_n_4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11   // beta
+	movq	ARG6, %r12   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	ARG8, %r11 // k0 
+	movq	ARG9, %r12 // k1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_n_4_gen_lib4, .-kernel_dgemv_n_4_gen_lib4
+#endif
+
+
+
+
+
+//                            1      2              3          4        5          6             7         8
+// void kernel_dgemv_t_4_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
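+//
+// Illustrative usage (sketch; names are placeholders): the distance between
+// consecutive 4-row panels of A is bs*sda = 4*sda doubles,
+//
+//   double alpha = 1.0, beta = 1.0;
+//   kernel_dgemv_t_4_lib4(k, &alpha, A_panel, sda, x, &beta, y, z);
+//
+// computes z[0..3] = alpha * A_panel(0:k-1,0:3)^T * x + beta * y[0..3].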
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_t_4_lib4
+	.type kernel_dgemv_t_4_lib4, @function
+kernel_dgemv_t_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_t_4_lib4
+_kernel_dgemv_t_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_t_4_lib4
+	.def kernel_dgemv_t_4_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_t_4_lib4, .-kernel_dgemv_t_4_lib4
+#endif
+
+
+
+
+
+//                               1      2              3          4        5          6             7         8           9
+// void kernel_dgemv_t_4_vs_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_t_4_vs_lib4
+	.type kernel_dgemv_t_4_vs_lib4, @function
+kernel_dgemv_t_4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_t_4_vs_lib4
+_kernel_dgemv_t_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_t_4_vs_lib4
+	.def kernel_dgemv_t_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+	movq	ARG9, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_t_4_vs_lib4, .-kernel_dgemv_t_4_vs_lib4
+#endif
+
+
+
+
+
+//                                1      2              3         4          5        6          7             8          9          10
+// void kernel_dgemv_t_4_gen_lib4(int k, double *alpha, int offA, double *A, int sda, double *x, double *beta, double *y, double *z, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_t_4_gen_lib4
+	.type kernel_dgemv_t_4_gen_lib4, @function
+kernel_dgemv_t_4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_t_4_gen_lib4
+_kernel_dgemv_t_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_t_4_gen_lib4
+	.def kernel_dgemv_t_4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner edge & kernel dgemv t
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG6, %r13  // x
+	movq	ARG3, %r14 // offA
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemv_add_t_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11   // beta
+	movq	ARG8, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG9, %r10 // z 
+	movq	ARG10, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_t_4_gen_lib4, .-kernel_dgemv_t_4_gen_lib4
+#endif
+
+
+
+
+
+//                                 1      2          3                   4          5          6
+// void kernel_dtrsv_ln_inv_4_lib4(int k, double *A, double *inv_diag_A, double *x, double *y, double *z);
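+//
+// note: one block step of a lower-triangular forward solve with precomputed
+// reciprocal diagonal: the gemv_n kernel below accumulates t = A(0:4,0:k)*x,
+// the blend_n_scale_m11 routine forms y - t, and the edge routine then solves
+// L*z = y - t by substitution, where L is the 4x4 lower-triangular block stored
+// right after the k processed columns (at A + k*4*sizeof(double)) and
+// inv_diag_A[i] = 1.0/L(i,i).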
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsv_ln_inv_4_lib4
+	.type kernel_dtrsv_ln_inv_4_lib4, @function
+kernel_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsv_ln_inv_4_lib4
+_kernel_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsv_ln_inv_4_lib4
+	.def kernel_dtrsv_ln_inv_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_ln_inv_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+	movq	%r11, %r13 // A+k*4*sizeof(double)
+
+
+	// call inner blender n
+
+	movq	ARG5, %r10   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_m11_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_m11_4_lib4
+#endif
+#endif
+
+
+	// solution
+
+	movq	%r13, %r10 // A+k*4*sizeof(double)
+	movq	ARG3, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSV_LN_INV_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsv_ln_inv_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsv_ln_inv_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsv_ln_inv_4_lib4, .-kernel_dtrsv_ln_inv_4_lib4
+#endif
+
+
+
+
+
+//                                    1      2          3                   4          5          6          7       8
+// void kernel_dtrsv_ln_inv_4_vs_lib4(int k, double *A, double *inv_diag_A, double *x, double *y, double *z, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsv_ln_inv_4_vs_lib4
+	.type kernel_dtrsv_ln_inv_4_vs_lib4, @function
+kernel_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsv_ln_inv_4_vs_lib4
+_kernel_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsv_ln_inv_4_vs_lib4
+	.def kernel_dtrsv_ln_inv_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_ln_inv_4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+	movq	%r11, %r13 // A+k*4*sizeof(double)
+
+
+	// call inner blender n
+
+	movq	ARG5, %r10   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_m11_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_m11_4_lib4
+#endif
+#endif
+
+
+	// solution
+
+	movq	%r13, %r10 // A+k*4*sizeof(double)
+	movq	ARG3, %r11 // inv_diag_A
+	movq	ARG8, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSV_LN_INV_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsv_ln_inv_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsv_ln_inv_4_vs_lib4
+#endif
+#endif
+
+
+	// store vs
+
+	movq	ARG6, %r10 // z 
+	movq	ARG7, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsv_ln_inv_4_vs_lib4, .-kernel_dtrsv_ln_inv_4_vs_lib4
+#endif
+
+
+
+
+
+//                                 1      2          3        4                   5          6          7
+// void kernel_dtrsv_lt_inv_4_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
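+//
+// note: transposed counterpart of the kernel above: the gemv_t kernel starts one
+// panel and four x entries past the beginning (subl $4 / addq $32 below) and
+// accumulates t = A(4:k,0:4)^T * x[4:k], the blend_t_scale_m11 routine forms
+// y - t, and the edge routine solves L^T * z = y - t by backward substitution on
+// the 4x4 block at A, using inv_diag_A[i] = 1.0/L(i,i).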
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsv_lt_inv_4_lib4
+	.type kernel_dtrsv_lt_inv_4_lib4, @function
+kernel_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsv_lt_inv_4_lib4
+_kernel_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsv_lt_inv_4_lib4
+	.def kernel_dtrsv_lt_inv_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	addq	%r12, %r11 // A+4*sda*sizeof(double)
+	movq	ARG5, %r13 // x
+	addq	$32, %r13 // x+4 
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG6, %r10 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+	// solution
+
+	movq	ARG2, %r10 // A
+	movq	ARG4, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSV_LT_INV_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsv_lt_inv_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsv_lt_inv_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsv_lt_inv_4_lib4, .-kernel_dtrsv_lt_inv_4_lib4
+#endif
+
+
+
+
+
+//                                 rdi    rsi        rdx      rcx                 r8         r9         rsp+8   
+// void kernel_dtrsv_lt_inv_3_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsv_lt_inv_3_lib4
+	.type kernel_dtrsv_lt_inv_3_lib4, @function
+kernel_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsv_lt_inv_3_lib4
+_kernel_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsv_lt_inv_3_lib4
+	.def kernel_dtrsv_lt_inv_3_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_3_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	addq	%r12, %r11 // A+4*sda*sizeof(double)
+	movq	ARG5, %r13 // x
+	addq	$32, %r13 // x+4 
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG6, %r10 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+	// solution
+
+	movq	ARG2, %r10 // A
+	movq	ARG4, %r11 // inv_diag_A
+	movq	ARG1, %r12 // k
+	movq	ARG5, %r13 // x
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSV_LT_INV_3_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsv_lt_inv_3_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsv_lt_inv_3_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	$3, %r11
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsv_lt_inv_3_lib4, .-kernel_dtrsv_lt_inv_3_lib4
+#endif
+
+
+
+
+
+//                                 rdi    rsi        rdx      rcx                 r8         r9         rsp+8 
+// void kernel_dtrsv_lt_inv_2_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsv_lt_inv_2_lib4
+	.type kernel_dtrsv_lt_inv_2_lib4, @function
+kernel_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsv_lt_inv_2_lib4
+_kernel_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsv_lt_inv_2_lib4
+	.def kernel_dtrsv_lt_inv_2_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_2_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movslq	%r12d, %r12
+	addq	%r12, %r11 // A+4*sda*sizeof(double)
+	movq	ARG5, %r13 // x
+	addq	$32, %r13 // x+4 
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG6, %r10 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+	// solution
+
+	movq	ARG2, %r10 // A
+	movq	ARG4, %r11 // inv_diag_A
+	movq	ARG1, %r12 // k
+	movq	ARG5, %r13 // x
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSV_LT_INV_2_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsv_lt_inv_2_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsv_lt_inv_2_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	$2, %r11
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsv_lt_inv_2_lib4, .-kernel_dtrsv_lt_inv_2_lib4
+#endif
+
+
+
+
+
+//                                 rdi    rsi        rdx      rcx                 r8         r9         rsp+8 
+// void kernel_dtrsv_lt_inv_1_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsv_lt_inv_1_lib4
+	.type kernel_dtrsv_lt_inv_1_lib4, @function
+kernel_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsv_lt_inv_1_lib4
+_kernel_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsv_lt_inv_1_lib4
+	.def kernel_dtrsv_lt_inv_1_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_1_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movslq	%r12d, %r12
+	addq	%r12, %r11 // A+4*sda*sizeof(double)
+	movq	ARG5, %r13 // x
+	addq	$32, %r13 // x+4 
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG6, %r10 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+	// solution
+
+	movq	ARG2, %r10 // A
+	movq	ARG4, %r11 // inv_diag_A
+	movq	ARG1, %r12 // k
+	movq	ARG5, %r13 // x
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSV_LT_INV_1_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsv_lt_inv_1_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsv_lt_inv_1_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	$1, %r11
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsv_lt_inv_1_lib4, .-kernel_dtrsv_lt_inv_1_lib4
+#endif
+
+
+
+
+
+//                            rdi    rsi        rdx        rcx
+// void kernel_dtrmv_un_4_lib4(int k, double *A, double *x, double *z);
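+//
+// note: upper-triangular (not transposed) matrix-vector product: the edge routine
+// handles the first 4 columns with the entries below the diagonal masked to zero
+// (vblendpd against a zeroed register), the plain gemv_n kernel handles the
+// remaining columns, so z[0:4] = U(0:4,0:k) * x with U upper triangular.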
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmv_un_4_lib4
+	.type kernel_dtrmv_un_4_lib4, @function
+kernel_dtrmv_un_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmv_un_4_lib4
+_kernel_dtrmv_un_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmv_un_4_lib4
+	.def kernel_dtrmv_un_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_un_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dtrmv edge & dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // x
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMV_UN_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmv_un_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmv_un_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+	// call inner blend n
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG4, %r10 // z
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmv_un_4_lib4, .-kernel_dtrmv_un_4_lib4
+#endif
+
+
+
+
+
+//                             rdi    rsi        rdx      rcx        r8
+// void kernel_dtrmv_ut_4_lib4(int k, double *A, int sda, double *x, double *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmv_ut_4_lib4
+	.type kernel_dtrmv_ut_4_lib4, @function
+kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmv_ut_4_lib4
+_kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmv_ut_4_lib4
+	.def kernel_dtrmv_ut_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_ut_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dtrmv kernel ut
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movslq	%r12d, %r12
+	movq	ARG4, %r13  // x
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_DTRMV_UT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dtrmv_ut_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dtrmv_ut_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmv_ut_4_lib4, .-kernel_dtrmv_ut_4_lib4
+#endif
+
+
+
+
+
+//                                rdi    rsi        rdx      rcx        r8         r9
+// void kernel_dtrmv_ut_4_vs_lib4(int k, double *A, int sda, double *x, double *z, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmv_ut_4_vs_lib4
+	.type kernel_dtrmv_ut_4_vs_lib4, @function
+kernel_dtrmv_ut_4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmv_ut_4_vs_lib4
+_kernel_dtrmv_ut_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmv_ut_4_vs_lib4
+	.def kernel_dtrmv_ut_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_ut_4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dtrmv kernel ut
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movslq	%r12d, %r12
+	movq	ARG4, %r13  // x
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_DTRMV_UT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dtrmv_ut_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dtrmv_ut_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // z 
+	movq	ARG6, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmv_ut_4_vs_lib4, .-kernel_dtrmv_ut_4_vs_lib4
+#endif
+
+
+
+
+
+//                             1      2                3                4          5        6            7            8               9            10           11
+// void kernel_dgemv_nt_4_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
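+//
+// note: fused kernel that reads the k x 4 panel of A only once and produces both
+// products: the n direction z_n += alpha_n*A*x_n (x_n is pre-scaled by alpha_n
+// into ymm6-ymm9 below, and z_n is read and updated in place by the inner nt
+// kernel) and the t direction z_t = alpha_t*A^T*x_t + beta_t*y_t.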
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_nt_4_lib4
+	.type kernel_dgemv_nt_4_lib4, @function
+kernel_dgemv_nt_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_nt_4_lib4
+_kernel_dgemv_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_nt_4_lib4
+	.def kernel_dgemv_nt_4_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha_n
+	vbroadcastsd 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+
+	vbroadcastsd 0(%r10), %ymm6
+	vmulpd		%ymm15, %ymm6, %ymm6
+	vbroadcastsd 8(%r10), %ymm7
+	vmulpd		%ymm15, %ymm7, %ymm7
+	vbroadcastsd 16(%r10), %ymm8
+	vmulpd		%ymm15, %ymm8, %ymm8
+	vbroadcastsd 24(%r10), %ymm9
+	vmulpd		%ymm15, %ymm9, %ymm9
+
+
+	// inner kernel dgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG7, %r13  // x_t
+	movq	ARG10, %r14  // z_n
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+	movq	ARG3, %r10 // alpha_t
+	movq	ARG8, %r11   // beta_t
+	movq	ARG9, %r12   // y_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG11, %r10 // z_t 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_nt_4_lib4, .-kernel_dgemv_nt_4_lib4
+#endif
+
+
+
+
+
+//                                1      2                3                4          5        6            7            8               9            10           11           12
+// void kernel_dgemv_nt_4_vs_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_nt_4_vs_lib4
+	.type kernel_dgemv_nt_4_vs_lib4, @function
+kernel_dgemv_nt_4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_nt_4_vs_lib4
+_kernel_dgemv_nt_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_nt_4_vs_lib4
+	.def kernel_dgemv_nt_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha_n
+	vbroadcastsd 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+	movq	ARG12, %r11 // km
+
+	vbroadcastsd 0(%r10), %ymm6
+	vmulpd		%ymm15, %ymm6, %ymm6
+	cmpl	$2, %r11d
+	jl		0f
+	vbroadcastsd 8(%r10), %ymm7
+	vmulpd		%ymm15, %ymm7, %ymm7
+	cmpl	$3, %r11d
+	jl		0f
+	vbroadcastsd 16(%r10), %ymm8
+	vmulpd		%ymm15, %ymm8, %ymm8
+	je		0f
+	vbroadcastsd 24(%r10), %ymm9
+	vmulpd		%ymm15, %ymm9, %ymm9
+0:
+
+	// inner kernel dgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG7, %r13  // x_t
+	movq	ARG10, %r14  // z_n
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+	movq	ARG3, %r10 // alpha_t
+	movq	ARG8, %r11   // beta_t
+	movq	ARG9, %r12   // y_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG11, %r10 // z_t 
+	movq	ARG12, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_nt_4_vs_lib4, .-kernel_dgemv_nt_4_vs_lib4
+#endif
+
+
+
+
+
+//                            1      2              3          4        5           6
+// void kernel_dsymv_l_4_lib4(int k, double *alpha, double *A, int sda, double *x, double *z);
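+//
+// note: symmetric matrix-vector product z += alpha*A*x using only the
+// lower-triangular part of A: the edge routine handles the 4x4 diagonal block,
+// the fused nt kernel walks the panel below it once (updating the remaining
+// entries of z in place through the z_n pointer), and the blend_t_scale_a1
+// routine adds the mirrored upper-triangle contributions to z[0:4].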
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsymv_l_4_lib4
+	.type kernel_dsymv_l_4_lib4, @function
+kernel_dsymv_l_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsymv_l_4_lib4
+_kernel_dsymv_l_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsymv_l_4_lib4
+	.def kernel_dsymv_l_4_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha
+	vbroadcastsd 0(%r10), %ymm15
+
+	movq	ARG5, %r10 // x_n
+
+	vbroadcastsd 0(%r10), %ymm6
+	vmulpd		%ymm15, %ymm6, %ymm6
+	vbroadcastsd 8(%r10), %ymm7
+	vmulpd		%ymm15, %ymm7, %ymm7
+	vbroadcastsd 16(%r10), %ymm8
+	vmulpd		%ymm15, %ymm8, %ymm8
+	vbroadcastsd 24(%r10), %ymm9
+	vmulpd		%ymm15, %ymm9, %ymm9
+
+
+	// inner edge dsymv & kernel dgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13  // x_t
+	movq	ARG6, %r14  // z_n
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dsymv_add_nt_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // z_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // z_t 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsymv_l_4_lib4, .-kernel_dsymv_l_4_lib4
+#endif
+
+
+
+
+
+//                                1      2              3         4          5        6           7          8
+// void kernel_dsymv_l_4_gen_lib4(int k, double *alpha, int offA, double *A, int sda, double *x, double *z, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsymv_l_4_gen_lib4
+	.type kernel_dsymv_l_4_gen_lib4, @function
+kernel_dsymv_l_4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsymv_l_4_gen_lib4
+_kernel_dsymv_l_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsymv_l_4_gen_lib4
+	.def kernel_dsymv_l_4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha
+	vbroadcastsd 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+
+	vbroadcastsd 0(%r10), %ymm6
+	vmulpd		%ymm15, %ymm6, %ymm6
+	vbroadcastsd 8(%r10), %ymm7
+	vmulpd		%ymm15, %ymm7, %ymm7
+	vbroadcastsd 16(%r10), %ymm8
+	vmulpd		%ymm15, %ymm8, %ymm8
+	vbroadcastsd 24(%r10), %ymm9
+	vmulpd		%ymm15, %ymm9, %ymm9
+
+
+	// inner edge dsymv & kernel dgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13  // x_t
+	movq	ARG7, %r14  // z_n
+	movq	ARG3, %r15 // offA
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_DSYMV_ADD_NT_4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dsymv_add_nt_4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dsymv_add_nt_4_gen_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11   // z_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z_t 
+	movq	ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsymv_l_4_gen_lib4, .-kernel_dsymv_l_4_gen_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
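+
+// note on the encoding above: each 64-bit double is emitted as two .long words,
+// low half first, e.g. the pair (0, 1071644672) is 0x3FE0000000000000 = 0.5 and
+// (0, 1072693248) is 0x3FF0000000000000 = 1.0; LC02/LC03 thus hold the ascending
+// lanes 0.5 ... 7.5 (the comments list the lanes from high to low), used to build
+// the partial load/store masks for the vmaskmovpd clean-up paths.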
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemv_8_lib4.S b/kernel/avx/kernel_dgemv_8_lib4.S
new file mode 100644
index 0000000..53d371e
--- /dev/null
+++ b/kernel/avx/kernel_dgemv_8_lib4.S
@@ -0,0 +1,1575 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
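+
+// note on the macros above: on the System V ABI (OS_LINUX, OS_MAC) the first six
+// integer/pointer arguments arrive in rdi, rsi, rdx, rcx, r8, r9 and the rest on
+// the stack, which after the 64-byte register save area starts at
+// STACKSIZE+8(%rsp); on Windows x64 only rcx, rdx, r8, r9 carry arguments, the
+// caller's 32-byte shadow space plus the return address put ARG5 at
+// STACKSIZE+40(%rsp), and rdi, rsi and xmm6-xmm15 are callee-saved, hence the
+// extra spills in PROLOGUE/EPILOGUE.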
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- x
+// r15   <- dirty
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z0 z1 z2 z3]_b
+// ymm3  <- [z4 z5 z6 z7]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- x+k*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z0 z1 z2 z3]_b
+// ymm3  <- [z4 z5 z6 z7]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_n_8_lib4, @function
+inner_kernel_dgemv_add_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_n_8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_8_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+	cmpl	$4, %r10d
+
+	prefetcht0	0(%r11) // software prefetch
+	prefetcht0	0(%r15) // software prefetch
+	prefetcht0	64(%r11) // software prefetch
+	prefetcht0	64(%r15) // software prefetch
+
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	128(%r11) // software prefetch
+	prefetcht0	128(%r15) // software prefetch
+
+	vbroadcastsd	0(%r13), %ymm12
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmovapd	0(%r15), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	subl	$4, %r10d
+
+	vbroadcastsd	8(%r13), %ymm12
+	vmovapd	32(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmovapd	32(%r15), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	
+	prefetcht0	192(%r11) // software prefetch
+	prefetcht0	192(%r15) // software prefetch
+
+	vbroadcastsd	16(%r13), %ymm12
+	vmovapd	64(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmovapd	64(%r15), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+
+	vbroadcastsd	24(%r13), %ymm12
+	addq	$32, %r13 // x+4
+	vmovapd	96(%r11), %ymm8
+	addq	$128, %r11 // A0+4*bs
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmovapd	96(%r15), %ymm8
+	addq	$128, %r15 // A1+4*bs
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vbroadcastsd	0(%r13), %ymm12
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmovapd	0(%r15), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	addq	$32, %r11
+	addq	$32, %r15
+	addq	$8, %r13
+	
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+
+	jg		0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_n_8_lib4, .-inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm4  <- [z4a z4b z4c z4d]
+// ymm5  <- [z5a z5b z5c z5d]
+// ymm6  <- [z6a z6b z6c z6d]
+// ymm7  <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x+k*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm4  <- [z4a z4b z4c z4d]
+// ymm5  <- [z5a z5b z5c z5d]
+// ymm6  <- [z6a z6b z6c z6d]
+// ymm7  <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_T_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_t_8_lib4, @function
+inner_kernel_dgemv_add_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_t_8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_8_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+
+	prefetcht0	0(%r11) // software prefetch
+	prefetcht0	64(%r11) // software prefetch
+	prefetcht0	128(%r11) // software prefetch
+	prefetcht0	192(%r11) // software prefetch
+
+	jl		0f // clean-up loop
+
+	movq	%r11, %r14
+	addq	%r12, %r14 // A+bs*sda
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	0(%r14) // software prefetch
+
+	vmovupd	0(%r13), %ymm12
+	addq	$32, %r13 // x+4
+
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	subl	$4, %r10d
+
+	vmovapd	32(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	prefetcht0	64(%r14) // software prefetch
+
+	vmovapd	64(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmovapd	96(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+
+	prefetcht0	128(%r14) // software prefetch
+
+	vmovapd	128(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	
+	vmovapd	160(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	
+	prefetcht0	192(%r14) // software prefetch
+
+	vmovapd	192(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	vmovapd	224(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+	
+//	addq	%r12, %r11 // A+bs*sda
+	movq	%r14, %r11 // A+bs*sda
+	addq	%r12, %r14 // A+bs*sda+bs*sda
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vcvtsi2sd	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vsubpd		%ymm14, %ymm13, %ymm14
+
+	vmaskmovpd	0(%r13), %ymm14, %ymm12
+
+	vmovapd	0(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	
+	vmovapd	32(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	
+	vmovapd	64(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	vmovapd	96(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+		
+	vmovapd	128(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	
+	vmovapd	160(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	
+	vmovapd	192(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm6, %ymm15, %ymm6
+
+	vmovapd	224(%r11), %ymm8
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm7, %ymm15, %ymm7
+
+	sall	$3, %r10d
+//	movslq	%r10d, %r10
+	addq	%r10, %r11
+	addq	%r10, %r13
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_t_8_lib4, .-inner_kernel_dgemv_add_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- x
+// r15   <- dirty
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z0 z1 z2 z3]_b
+// ymm3  <- [z4 z5 z6 z7]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- k-4
+// r11   <- A+4*4*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- x+4*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z0 z1 z2 z3]_b
+// ymm3  <- [z4 z5 z6 z7]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMV_UN_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmv_un_8_lib4, @function
+inner_edge_dtrmv_un_8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmv_un_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmv_un_8_lib4:
+#endif
+#endif
+	
+	movq	%r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	// first 4 columns
+	vmovapd			0(%r11), %ymm8
+	vblendpd		$0x1, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	
+	subl			$4, %r10d
+
+	vmovapd			32(%r11), %ymm8
+	vblendpd		$0x3, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	8(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	
+	vmovapd			64(%r11), %ymm8
+	vblendpd		$0x7, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	16(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+
+	vmovapd			96(%r11), %ymm8
+	vbroadcastsd	24(%r13), %ymm12
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	
+	addq			$128, %r11
+	addq			$128, %r15
+	addq			$32, %r13
+
+
+
+	// last 4 columns
+	vbroadcastsd	0(%r13), %ymm12
+	vmovapd			0(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			0(%r15), %ymm8
+	vblendpd		$0x1, %ymm8, %ymm14, %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	
+	subl			$4, %r10d
+
+	vbroadcastsd	8(%r13), %ymm12
+	vmovapd			32(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vmovapd			32(%r15), %ymm8
+	vblendpd		$0x3, %ymm8, %ymm14, %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	
+	vbroadcastsd	16(%r13), %ymm12
+	vmovapd			64(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			64(%r15), %ymm8
+	vblendpd		$0x7, %ymm8, %ymm14, %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+
+	vbroadcastsd	24(%r13), %ymm12
+	vmovapd			96(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm2, %ymm15, %ymm2
+	vmovapd			96(%r15), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm3, %ymm15, %ymm3
+	
+	addq			$128, %r11
+	addq			$128, %r15
+	addq			$32, %r13
+	
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmv_un_8_lib4, .-inner_edge_dtrmv_un_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n
+//
+// input arguments:
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_8_lib4, @function
+inner_blend_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_8_lib4; .scl 2; .type 32; .endef
+inner_blend_n_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm2, %ymm0
+	vaddpd	%ymm1, %ymm3, %ymm1
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_8_lib4, .-inner_blend_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t
+//
+// input arguments:
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_8_lib4, @function
+inner_blend_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_8_lib4; .scl 2; .type 32; .endef
+inner_blend_t_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm5, %ymm4, %ymm4
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vhaddpd	%ymm7, %ymm6, %ymm6
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm3
+	vperm2f128	$0x2, %ymm4, %ymm6, %ymm5
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vperm2f128	$0x13, %ymm4, %ymm6, %ymm4
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm4, %ymm5, %ymm1
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_8_lib4, .-inner_blend_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_SCALE_AB_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_scale_ab_8_lib4, @function
+inner_blend_n_scale_ab_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_scale_ab_8_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm2, %ymm0
+	vaddpd	%ymm1, %ymm3, %ymm1
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm1, %ymm15, %ymm1
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd		0(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	vmovupd		32(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm1, %ymm14, %ymm1
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_scale_ab_8_lib4, .-inner_blend_n_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_AB_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_ab_8_lib4, @function
+inner_blend_t_scale_ab_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_ab_8_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm5, %ymm4, %ymm4
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vhaddpd	%ymm7, %ymm6, %ymm6
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm3
+	vperm2f128	$0x2, %ymm4, %ymm6, %ymm5
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vperm2f128	$0x13, %ymm4, %ymm6, %ymm4
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm4, %ymm5, %ymm1
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm1, %ymm15, %ymm1
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd		0(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	vmovupd		32(%r12), %ymm14
+	vmulpd		%ymm15, %ymm14, %ymm14
+	vaddpd		%ymm1, %ymm14, %ymm1
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_ab_8_lib4, .-inner_blend_t_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==n
+//
+// input arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
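+//
+// overall: alg==0: z = z_a + z_b;  alg==1: z = y + (z_a + z_b);  else: z = y - (z_a + z_b)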
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLENDER_N_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blender_n_8_lib4, @function
+inner_blender_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_blender_n_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blender_n_8_lib4; .scl 2; .type 32; .endef
+inner_blender_n_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm2, %ymm0
+	vaddpd	%ymm1, %ymm3, %ymm1
+
+	cmpl	$0, %r10d // alg
+	je		0f // return
+
+	cmpl	$1, %r10d // alg
+	jne		1f // alg==-1
+
+	// alg==1
+	vmovupd		0(%r11), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+
+	jmp		0f // return
+
+1:
+
+	// alg==-1
+	vmovupd		0(%r11), %ymm15
+	vsubpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vsubpd		%ymm1, %ymm15, %ymm1
+
+0: // return
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blender_n_8_lib4, .-inner_blender_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==t
+//
+// input arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
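+//
+// overall: horizontal reduction of the 8 partial-sum registers into ymm0:ymm1,
+// then alg==0: z;  alg==1: z = y + z;  else: z = y - z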
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLENDER_T_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blender_t_8_lib4, @function
+inner_blender_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_blender_t_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blender_t_8_lib4; .scl 2; .type 32; .endef
+inner_blender_t_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm5, %ymm4, %ymm4
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vhaddpd	%ymm7, %ymm6, %ymm6
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm3
+	vperm2f128	$0x2, %ymm4, %ymm6, %ymm5
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vperm2f128	$0x13, %ymm4, %ymm6, %ymm4
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm4, %ymm5, %ymm1
+
+	cmpl	$0, %r10d // alg
+	je		0f // return
+
+	cmpl	$1, %r10d // alg
+	jne		1f // alg==-1
+
+	// alg==1
+	vmovupd		0(%r11), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+
+	jmp		0f // return
+
+1:
+
+	// alg==-1
+	vmovupd		0(%r11), %ymm15
+	vsubpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vsubpd		%ymm1, %ymm15, %ymm1
+
+0: // return
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blender_t_8_lib4, .-inner_blender_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store 
+//
+// input arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+//
+// output arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
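+//
+// overall: z[0..7] <- ymm0:ymm1 (unaligned stores)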
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8_lib4, @function
+inner_store_8_lib4:
+#elif defined(OS_MAC)
+_inner_store_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8_lib4; .scl 2; .type 32; .endef
+inner_store_8_lib4:
+#endif
+#endif
+	
+	vmovupd %ymm0, 0(%r10)
+	vmovupd %ymm1, 32(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8_lib4, .-inner_store_8_lib4
+#endif
+#endif
+
+
+
+
+
+//                            rdi    rsi            rdx        rcx      r8         r9            rsp+8      rsp+16
+// void kernel_dgemv_n_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
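+//
+// z[0..7] <- alpha * A * x + beta * y, with A an 8 x k matrix in panel-major storage
+// (panel size 4, panel stride sda)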
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_n_8_lib4
+	.type kernel_dgemv_n_8_lib4, @function
+kernel_dgemv_n_8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_n_8_lib4
+_kernel_dgemv_n_8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_n_8_lib4
+	.def kernel_dgemv_n_8_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+	// store
+
+	movq	ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_n_8_lib4, .-kernel_dgemv_n_8_lib4
+#endif
+
+
+
+
+
+//                            rdi    rsi           rdx         rcx      r8         r9            rsp+8      rsp+16
+// void kernel_dgemv_t_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
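+//
+// z[0..7] <- alpha * A^T * x + beta * y, with A a k x 8 matrix in panel-major storage
+// (panel size 4, panel stride sda)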
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_t_8_lib4
+	.type kernel_dgemv_t_8_lib4, @function
+kernel_dgemv_t_8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_t_8_lib4
+_kernel_dgemv_t_8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_t_8_lib4
+	.def kernel_dgemv_t_8_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_8_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_8_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_t_8_lib4, .-kernel_dgemv_t_8_lib4
+#endif
+
+
+
+
+
+//                             rdi    rsi        rdx      rcx        r8
+// void kernel_dtrmv_un_8_lib4(int k, double *A, int sda, double *x, double *z);
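+//
+// z[0..7] <- A * x for an upper-triangular, not-transposed ("un") A in panel-major storage,
+// computed as a triangular edge part plus a dgemv_n part over the remaining columns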
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmv_un_8_lib4
+	.type kernel_dtrmv_un_8_lib4, @function
+kernel_dtrmv_un_8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmv_un_8_lib4
+_kernel_dtrmv_un_8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmv_un_8_lib4
+	.def kernel_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_un_8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dtrmv edge & dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG4, %r13  // x
+
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMV_UN_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmv_un_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmv_un_8_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+	// call inner blend n
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_8_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // z
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmv_un_8_lib4, .-kernel_dtrmv_un_8_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgeqrf_4_lib4.c b/kernel/avx/kernel_dgeqrf_4_lib4.c
new file mode 100644
index 0000000..a5faf20
--- /dev/null
+++ b/kernel/avx/kernel_dgeqrf_4_lib4.c
@@ -0,0 +1,2751 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+#include "../../include/blasfeo_d_kernel.h"
+
+
+
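+// unblocked Householder QR factorization of an m x 4 panel-major (lib4) block:
+// the 4 Householder scalars (tau) are returned in dD, the reflectors overwrite the
+// strictly lower part of pD and the R factor its upper triangle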
+void kernel_dgeqrf_4_lib4(int m, double *pD, int sdd, double *dD)
+	{
+	int ii, jj, ll;
+	double alpha, beta, tmp, w1, w2, w3;
+	const int ps = 4;
+	// first column
+	beta = 0.0;
+	ii = 1;
+	if(m>1)
+		{
+		tmp = pD[1+ps*0];
+		beta += tmp*tmp;
+		if(m>2)
+			{
+			tmp = pD[2+ps*0];
+			beta += tmp*tmp;
+			if(m>3)
+				{
+				tmp = pD[3+ps*0];
+				beta += tmp*tmp;
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[0] = 0.0;
+		}
+	else
+		{
+		alpha = pD[0+ps*0];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau0
+		dD[0] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0
+		pD[0+ps*0] = beta;
+		ii = 1;
+		if(m>1)
+			{
+			pD[1+ps*0] *= tmp;
+			if(m>2)
+				{
+				pD[2+ps*0] *= tmp;
+				if(m>3)
+					{
+					pD[3+ps*0] *= tmp;
+					}
+				}
+			}
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*0] *= tmp;
+			pD[1+ii*sdd+ps*0] *= tmp;
+			pD[2+ii*sdd+ps*0] *= tmp;
+			pD[3+ii*sdd+ps*0] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*0] *= tmp;
+			}
+		}
+	// gemv_t & ger
+	w1 = pD[0+ps*1];
+	w2 = pD[0+ps*2];
+	w3 = pD[0+ps*3];
+	if(m>1)
+		{
+		w1 += pD[1+ps*1] * pD[1+ps*0];
+		w2 += pD[1+ps*2] * pD[1+ps*0];
+		w3 += pD[1+ps*3] * pD[1+ps*0];
+		if(m>2)
+			{
+			w1 += pD[2+ps*1] * pD[2+ps*0];
+			w2 += pD[2+ps*2] * pD[2+ps*0];
+			w3 += pD[2+ps*3] * pD[2+ps*0];
+			if(m>3)
+				{
+				w1 += pD[3+ps*1] * pD[3+ps*0];
+				w2 += pD[3+ps*2] * pD[3+ps*0];
+				w3 += pD[3+ps*3] * pD[3+ps*0];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		w1 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+		w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+		w1 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+		w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+		w1 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+		w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+		w1 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+		w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		w1 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+		w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+		}
+	w1 = - dD[0] * w1;
+	w2 = - dD[0] * w2;
+	w3 = - dD[0] * w3;
+	pD[0+ps*1] += w1;
+	pD[0+ps*2] += w2;
+	pD[0+ps*3] += w3;
+	if(m>1)
+		{
+		pD[1+ps*1] += w1 * pD[1+ps*0];
+		pD[1+ps*2] += w2 * pD[1+ps*0];
+		pD[1+ps*3] += w3 * pD[1+ps*0];
+		if(m>2)
+			{
+			pD[2+ps*1] += w1 * pD[2+ps*0];
+			pD[2+ps*2] += w2 * pD[2+ps*0];
+			pD[2+ps*3] += w3 * pD[2+ps*0];
+			if(m>3)
+				{
+				pD[3+ps*1] += w1 * pD[3+ps*0];
+				pD[3+ps*2] += w2 * pD[3+ps*0];
+				pD[3+ps*3] += w3 * pD[3+ps*0];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		pD[0+ii*sdd+ps*1] += w1 * pD[0+ii*sdd+ps*0];
+		pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*0];
+		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*0];
+		pD[1+ii*sdd+ps*1] += w1 * pD[1+ii*sdd+ps*0];
+		pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*0];
+		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*0];
+		pD[2+ii*sdd+ps*1] += w1 * pD[2+ii*sdd+ps*0];
+		pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*0];
+		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*0];
+		pD[3+ii*sdd+ps*1] += w1 * pD[3+ii*sdd+ps*0];
+		pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*0];
+		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*0];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		pD[ll+ii*sdd+ps*1] += w1 * pD[ll+ii*sdd+ps*0];
+		pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*0];
+		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*0];
+		}
+	if(m==1)
+		return;
+	// second column
+	beta = 0.0;
+	if(m>2)
+		{
+		tmp = pD[2+ps*1];
+		beta += tmp*tmp;
+		if(m>3)
+			{
+			tmp = pD[3+ps*1];
+			beta += tmp*tmp;
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[1] = 0.0;
+		}
+	else
+		{
+		alpha = pD[1+ps*1];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau0
+		dD[1] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0
+		pD[1+ps*1] = beta;
+		if(m>2)
+			{
+			pD[2+ps*1] *= tmp;
+			if(m>3)
+				{
+				pD[3+ps*1] *= tmp;
+				}
+			}
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*1] *= tmp;
+			pD[1+ii*sdd+ps*1] *= tmp;
+			pD[2+ii*sdd+ps*1] *= tmp;
+			pD[3+ii*sdd+ps*1] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*1] *= tmp;
+			}
+		}
+	// gemv_t & ger
+	w2 = pD[1+ps*2];
+	w3 = pD[1+ps*3];
+	if(m>2)
+		{
+		w2 += pD[2+ps*2] * pD[2+ps*1];
+		w3 += pD[2+ps*3] * pD[2+ps*1];
+		if(m>3)
+			{
+			w2 += pD[3+ps*2] * pD[3+ps*1];
+			w3 += pD[3+ps*3] * pD[3+ps*1];
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+		w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+		w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+		w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+		}
+	w2 = - dD[1] * w2;
+	w3 = - dD[1] * w3;
+	pD[1+ps*2] += w2;
+	pD[1+ps*3] += w3;
+	if(m>2)
+		{
+		pD[2+ps*2] += w2 * pD[2+ps*1];
+		pD[2+ps*3] += w3 * pD[2+ps*1];
+		if(m>3)
+			{
+			pD[3+ps*2] += w2 * pD[3+ps*1];
+			pD[3+ps*3] += w3 * pD[3+ps*1];
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*1];
+		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*1];
+		pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*1];
+		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*1];
+		pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*1];
+		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*1];
+		pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*1];
+		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*1];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*1];
+		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*1];
+		}
+	if(m==2)
+		return;
+	// third column
+	beta = 0.0;
+	if(m>3)
+		{
+		tmp = pD[3+ps*2];
+		beta += tmp*tmp;
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[2] = 0.0;
+		}
+	else
+		{
+		alpha = pD[2+ps*2];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau0
+		dD[2] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0
+		pD[2+ps*2] = beta;
+		if(m>3)
+			{
+			pD[3+ps*2] *= tmp;
+			}
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*2] *= tmp;
+			pD[1+ii*sdd+ps*2] *= tmp;
+			pD[2+ii*sdd+ps*2] *= tmp;
+			pD[3+ii*sdd+ps*2] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*2] *= tmp;
+			}
+		}
+	// gemv_t & ger
+	w3 = pD[2+ps*3];
+	if(m>3)
+		{
+		w3 += pD[3+ps*3] * pD[3+ps*2];
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+		}
+	w3 = - dD[2] * w3;
+	pD[2+ps*3] += w3;
+	if(m>3)
+		{
+		pD[3+ps*3] += w3 * pD[3+ps*2];
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*2];
+		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*2];
+		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*2];
+		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*2];
+		}
+	if(m==3)
+		return;
+	// fourth column
+	beta = 0.0;
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[3] = 0.0;
+		}
+	else
+		{
+		alpha = pD[3+ps*3];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau0
+		dD[3] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0
+		pD[3+ps*3] = beta;
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*3] *= tmp;
+			pD[1+ii*sdd+ps*3] *= tmp;
+			pD[2+ii*sdd+ps*3] *= tmp;
+			pD[3+ii*sdd+ps*3] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*3] *= tmp;
+			}
+		}
+	return;
+	}
+
+
+// unblocked algorithm
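+// variable-size variant: computes k Householder reflectors of the m x n matrix starting
+// at row offset offD inside the panel-major storage (panel stride sdd), taus in dD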
+void kernel_dgeqrf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+	const int ps = 4;
+	imax = k;//m<n ? m : n;
+	double alpha, beta, tmp, w0;
+	double *pC00, *pC10, *pC01, *pC11;
+	int offset;
+	double *pD0 = pD-offD;
+	for(ii=0; ii<imax; ii++)
+		{
+		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+		pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+		beta = 0.0;
+		jmax = m-ii-1;
+		jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+		jmax0 = jmax<jmax0 ? jmax : jmax0;
+		offset = 0;
+		jj = 0;
+		if(jmax0>0)
+			{
+			for( ; jj<jmax0; jj++)
+				{
+				tmp = pC10[0+offset];
+				beta += tmp*tmp;
+				offset += 1;
+				}
+			offset += -ps+ps*sdd;
+			}
+		for( ; jj<jmax-3; jj+=4)
+			{
+			tmp = pC10[0+offset];
+			beta += tmp*tmp;
+			tmp = pC10[1+offset];
+			beta += tmp*tmp;
+			tmp = pC10[2+offset];
+			beta += tmp*tmp;
+			tmp = pC10[3+offset];
+			beta += tmp*tmp;
+			offset += ps*sdd;
+			}
+		for(ll=0; ll<jmax-jj; ll++)
+			{
+			tmp = pC10[0+offset];
+			beta += tmp*tmp;
+			offset += 1;
+			}
+		if(beta==0.0)
+			{
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			offset = 0;
+			jj = 0;
+			if(jmax0>0)
+				{
+				for( ; jj<jmax0; jj++)
+					{
+					pC10[0+offset] *= tmp;
+					offset += 1;
+					}
+				offset += -ps+ps*sdd;
+				}
+			for( ; jj<jmax-3; jj+=4)
+				{
+				pC10[0+offset] *= tmp;
+				pC10[1+offset] *= tmp;
+				pC10[2+offset] *= tmp;
+				pC10[3+offset] *= tmp;
+				offset += ps*sdd;
+				}
+			for(ll=0; ll<jmax-jj; ll++)
+				{
+				pC10[0+offset] *= tmp;
+				offset += 1;
+				}
+			pC00[0] = beta;
+			}
+		if(ii<n)
+			{
+			pC01 = pC00 + ps;
+			pC11 = pC10 + ps;
+			kmax = jmax;
+			kmax0 = jmax0;
+			jmax = n-ii-1;
+			jj = 0;
+			for( ; jj<jmax; jj++)
+				{
+				w0 = pC01[0+ps*jj] * 1.0;
+				offset = 0;
+				kk = 0;
+				if(kmax0>0)
+					{
+					for( ; kk<kmax0; kk++)
+						{
+						w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+						offset += 1;
+						}
+					offset += -ps+ps*sdd;
+					}
+				for( ; kk<kmax-3; kk+=4)
+					{
+					w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+					w0 += pC11[1+offset+ps*jj] * pC10[1+offset];
+					w0 += pC11[2+offset+ps*jj] * pC10[2+offset];
+					w0 += pC11[3+offset+ps*jj] * pC10[3+offset];
+					offset += ps*sdd;
+					}
+				for(ll=0; ll<kmax-kk; ll++)
+					{
+					w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+					offset += 1;
+					}
+				w0 = - dD[ii] * w0;
+				pC01[0+ps*jj] += w0;
+				offset = 0;
+				kk = 0;
+				if(kmax0>0)
+					{
+					for( ; kk<kmax0; kk++)
+						{
+						pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+						offset += 1;
+						}
+					offset = offset-ps+ps*sdd;
+					}
+				for( ; kk<kmax-3; kk+=4)
+					{
+					pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+					pC11[1+offset+ps*jj] += w0 * pC10[1+offset];
+					pC11[2+offset+ps*jj] += w0 * pC10[2+offset];
+					pC11[3+offset+ps*jj] += w0 * pC10[3+offset];
+					offset += ps*sdd;
+					}
+				for(ll=0; ll<kmax-kk; ll++)
+					{
+					pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+					offset += 1;
+					}
+				}
+			}
+		}
+	return;
+	}
+
+
+
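+// applies the block reflector built from the 4 Householder vectors in pD (taus in dD)
+// to the m x n matrix pC0: forms the lower triangular T, then computes W^T = C^T * V,
+// W^T *= T, and finally C -= V * W^T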
+void kernel_dlarf_4_lib4(int m, int n, double *pD, int sdd, double *dD, double *pC0, int sdc)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, ll;
+	const int ps = 4;
+	double v10,
+	       v20, v21,
+		   v30, v31, v32;
+	double tmp, d0, d1, d2, d3;
+	double *pC;
+	double pT[16];// = {};
+	int ldt = 4;
+	double pW[8];// = {};
+	int ldw = 2;
+	// dot product of v
+	v10 = 0.0;
+	v20 = 0.0;
+	v30 = 0.0;
+	v21 = 0.0;
+	v31 = 0.0;
+	v32 = 0.0;
+	if(m>1)
+		{
+		v10 = 1.0 * pD[1+ps*0];
+		if(m>2)
+			{
+			v10 += pD[2+ps*1] * pD[2+ps*0];
+			v20 = 1.0 * pD[2+ps*0];
+			v21 = 1.0 * pD[2+ps*1];
+			if(m>3)
+				{
+				v10 += pD[3+ps*1] * pD[3+ps*0];
+				v20 += pD[3+ps*2] * pD[3+ps*0];
+				v21 += pD[3+ps*2] * pD[3+ps*1];
+				v30 = 1.0 * pD[3+ps*0];
+				v31 = 1.0 * pD[3+ps*1];
+				v32 = 1.0 * pD[3+ps*2];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+		v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+		v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+		v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+		v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+		v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+		v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+		v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+		v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+		v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+		v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+		v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+		v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+		v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+		v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+		v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+		v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+		v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+		v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+		v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+		v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+		v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+		v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+		v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+		v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+		v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+		v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+		v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+		v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+		}
+	// compute lower triangular T containing tau for matrix update
+	pT[0+ldt*0] = dD[0];
+	pT[1+ldt*1] = dD[1];
+	pT[2+ldt*2] = dD[2];
+	pT[3+ldt*3] = dD[3];
+	pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+	pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+	pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+	pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+	pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+	pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+	// downgrade matrix
+	pW[0] = 0.0;
+	pW[1] = 0.0;
+	pW[2] = 0.0;
+	pW[3] = 0.0;
+	pW[4] = 0.0;
+	pW[5] = 0.0;
+	pW[6] = 0.0;
+	pW[7] = 0.0;
+	ii = 0;
+	for( ; ii<n-1; ii+=2)
+		{
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ldw*0] = tmp;
+		tmp = pC[0+ps*1];
+		pW[1+ldw*0] = tmp;
+		if(m>1)
+			{
+			d0 = pD[1+ps*0];
+			tmp = pC[1+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] = tmp;
+			tmp = pC[1+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] = tmp;
+			if(m>2)
+				{
+				d0 = pD[2+ps*0];
+				d1 = pD[2+ps*1];
+				tmp = pC[2+ps*0];
+				pW[0+ldw*0] += tmp * d0;
+				pW[0+ldw*1] += tmp * d1;
+				pW[0+ldw*2] = tmp;
+				tmp = pC[2+ps*1];
+				pW[1+ldw*0] += tmp * d0;
+				pW[1+ldw*1] += tmp * d1;
+				pW[1+ldw*2] = tmp;
+				if(m>3)
+					{
+					d0 = pD[3+ps*0];
+					d1 = pD[3+ps*1];
+					d2 = pD[3+ps*2];
+					tmp = pC[3+ps*0];
+					pW[0+ldw*0] += tmp * d0;
+					pW[0+ldw*1] += tmp * d1;
+					pW[0+ldw*2] += tmp * d2;
+					pW[0+ldw*3] = tmp;
+					tmp = pC[3+ps*1];
+					pW[1+ldw*0] += tmp * d0;
+					pW[1+ldw*1] += tmp * d1;
+					pW[1+ldw*2] += tmp * d2;
+					pW[1+ldw*3] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pD[0+jj*sdd+ps*0];
+			d1 = pD[0+jj*sdd+ps*1];
+			d2 = pD[0+jj*sdd+ps*2];
+			d3 = pD[0+jj*sdd+ps*3];
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[0+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[1+jj*sdd+ps*0];
+			d1 = pD[1+jj*sdd+ps*1];
+			d2 = pD[1+jj*sdd+ps*2];
+			d3 = pD[1+jj*sdd+ps*3];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[1+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[2+jj*sdd+ps*0];
+			d1 = pD[2+jj*sdd+ps*1];
+			d2 = pD[2+jj*sdd+ps*2];
+			d3 = pD[2+jj*sdd+ps*3];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[2+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[3+jj*sdd+ps*0];
+			d1 = pD[3+jj*sdd+ps*1];
+			d2 = pD[3+jj*sdd+ps*2];
+			d3 = pD[3+jj*sdd+ps*3];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[3+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pD[ll+jj*sdd+ps*0];
+			d1 = pD[ll+jj*sdd+ps*1];
+			d2 = pD[ll+jj*sdd+ps*2];
+			d3 = pD[ll+jj*sdd+ps*3];
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[ll+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			}
+		// compute W^T *= T
+		pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+		pW[1+ldw*3] = pT[3+ldt*0]*pW[1+ldw*0] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[1+ldw*2] + pT[3+ldt*3]*pW[1+ldw*3];
+		pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+		pW[1+ldw*2] = pT[2+ldt*0]*pW[1+ldw*0] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[1+ldw*2];
+		pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+		pW[1+ldw*1] = pT[1+ldt*0]*pW[1+ldw*0] + pT[1+ldt*1]*pW[1+ldw*1];
+		pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+		pW[1+ldw*0] = pT[0+ldt*0]*pW[1+ldw*0];
+		// compute C -= V * W^T
+		pC[0+ps*0] -= pW[0+ldw*0];
+		pC[0+ps*1] -= pW[1+ldw*0];
+		if(m>1)
+			{
+			pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+			pC[1+ps*1] -= pD[1+ps*0]*pW[1+ldw*0] + pW[1+ldw*1];
+			if(m>2)
+				{
+				pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+				pC[2+ps*1] -= pD[2+ps*0]*pW[1+ldw*0] + pD[2+ps*1]*pW[1+ldw*1] + pW[1+ldw*2];
+				if(m>3)
+					{
+					pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+					pC[3+ps*1] -= pD[3+ps*0]*pW[1+ldw*0] + pD[3+ps*1]*pW[1+ldw*1] + pD[3+ps*2]*pW[1+ldw*2] + pW[1+ldw*3];
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pD[0+jj*sdd+ps*0];
+			d1 = pD[0+jj*sdd+ps*1];
+			d2 = pD[0+jj*sdd+ps*2];
+			d3 = pD[0+jj*sdd+ps*3];
+			pC[0+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[0+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[1+jj*sdd+ps*0];
+			d1 = pD[1+jj*sdd+ps*1];
+			d2 = pD[1+jj*sdd+ps*2];
+			d3 = pD[1+jj*sdd+ps*3];
+			pC[1+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[1+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[2+jj*sdd+ps*0];
+			d1 = pD[2+jj*sdd+ps*1];
+			d2 = pD[2+jj*sdd+ps*2];
+			d3 = pD[2+jj*sdd+ps*3];
+			pC[2+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[2+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[3+jj*sdd+ps*0];
+			d1 = pD[3+jj*sdd+ps*1];
+			d2 = pD[3+jj*sdd+ps*2];
+			d3 = pD[3+jj*sdd+ps*3];
+			pC[3+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[3+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pD[ll+jj*sdd+ps*0];
+			d1 = pD[ll+jj*sdd+ps*1];
+			d2 = pD[ll+jj*sdd+ps*2];
+			d3 = pD[ll+jj*sdd+ps*3];
+			pC[ll+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[ll+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			}
+		}
+	for( ; ii<n; ii++)
+		{
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ldw*0] = tmp;
+		if(m>1)
+			{
+			tmp = pC[1+ps*0];
+			pW[0+ldw*0] += tmp * pD[1+ps*0];
+			pW[0+ldw*1] = tmp;
+			if(m>2)
+				{
+				tmp = pC[2+ps*0];
+				pW[0+ldw*0] += tmp * pD[2+ps*0];
+				pW[0+ldw*1] += tmp * pD[2+ps*1];
+				pW[0+ldw*2] = tmp;
+				if(m>3)
+					{
+					tmp = pC[3+ps*0];
+					pW[0+ldw*0] += tmp * pD[3+ps*0];
+					pW[0+ldw*1] += tmp * pD[3+ps*1];
+					pW[0+ldw*2] += tmp * pD[3+ps*2];
+					pW[0+ldw*3] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[0+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[0+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[0+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[0+jj*sdd+ps*3];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[1+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[1+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[1+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[1+jj*sdd+ps*3];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[2+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[2+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[2+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[2+jj*sdd+ps*3];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[3+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[3+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[3+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[3+jj*sdd+ps*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[ll+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[ll+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[ll+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[ll+jj*sdd+ps*3];
+			}
+		// compute W^T *= T
+		pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+		pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+		pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+		pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+		// compute C -= V * W^T
+		pC[0+ps*0] -= pW[0+ldw*0];
+		if(m>1)
+			{
+			pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+			if(m>2)
+				{
+				pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+				if(m>3)
+					{
+					pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			pC[0+jj*sdc+ps*0] -= pD[0+jj*sdd+ps*0]*pW[0+ldw*0] + pD[0+jj*sdd+ps*1]*pW[0+ldw*1] + pD[0+jj*sdd+ps*2]*pW[0+ldw*2] + pD[0+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[1+jj*sdc+ps*0] -= pD[1+jj*sdd+ps*0]*pW[0+ldw*0] + pD[1+jj*sdd+ps*1]*pW[0+ldw*1] + pD[1+jj*sdd+ps*2]*pW[0+ldw*2] + pD[1+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[2+jj*sdc+ps*0] -= pD[2+jj*sdd+ps*0]*pW[0+ldw*0] + pD[2+jj*sdd+ps*1]*pW[0+ldw*1] + pD[2+jj*sdd+ps*2]*pW[0+ldw*2] + pD[2+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[3+jj*sdc+ps*0] -= pD[3+jj*sdd+ps*0]*pW[0+ldw*0] + pD[3+jj*sdd+ps*1]*pW[0+ldw*1] + pD[3+jj*sdd+ps*2]*pW[0+ldw*2] + pD[3+jj*sdd+ps*3]*pW[0+ldw*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			pC[ll+jj*sdc+ps*0] -= pD[ll+jj*sdd+ps*0]*pW[0+ldw*0] + pD[ll+jj*sdd+ps*1]*pW[0+ldw*1] + pD[ll+jj*sdd+ps*2]*pW[0+ldw*2] + pD[ll+jj*sdd+ps*3]*pW[0+ldw*3];
+			}
+		}
+
+	return;
+	}
+
+
+
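+// same block-reflector update as kernel_dlarf_4_lib4, but with the reflectors also
+// provided transposed in pVt and a 4 x n workspace pW0; the bulk of the work is done
+// with dgemm_nn and dger4_sub kernels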
+void kernel_dlarf_t_4_lib4(int m, int n, double *pD, int sdd, double *pVt, double *dD, double *pC0, int sdc, double *pW0)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, ll;
+	const int ps = 4;
+	double v10,
+	       v20, v21,
+		   v30, v31, v32;
+	double c00, c01,
+	       c10, c11,
+	       c20, c21,
+	       c30, c31;
+	double a0, a1, a2, a3, b0, b1;
+	double tmp, d0, d1, d2, d3;
+	double *pC, *pW;
+	double pT[16];// = {};
+	int ldt = 4;
+	// dot product of v
+	v10 = 0.0;
+	v20 = 0.0;
+	v30 = 0.0;
+	v21 = 0.0;
+	v31 = 0.0;
+	v32 = 0.0;
+	if(m>1)
+		{
+		v10 = 1.0 * pD[1+ps*0];
+		if(m>2)
+			{
+			v10 += pD[2+ps*1] * pD[2+ps*0];
+			v20 = 1.0 * pD[2+ps*0];
+			v21 = 1.0 * pD[2+ps*1];
+			if(m>3)
+				{
+				v10 += pD[3+ps*1] * pD[3+ps*0];
+				v20 += pD[3+ps*2] * pD[3+ps*0];
+				v21 += pD[3+ps*2] * pD[3+ps*1];
+				v30 = 1.0 * pD[3+ps*0];
+				v31 = 1.0 * pD[3+ps*1];
+				v32 = 1.0 * pD[3+ps*2];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+		v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+		v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+		v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+		v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+		v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+		v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+		v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+		v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+		v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+		v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+		v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+		v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+		v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+		v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+		v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+		v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+		v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+		v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+		v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+		v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+		v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+		v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+		v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+		v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+		v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+		v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+		v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+		v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+		}
+	// compute lower triangular T containing tau for matrix update
+	pT[0+ldt*0] = dD[0];
+	pT[1+ldt*1] = dD[1];
+	pT[2+ldt*2] = dD[2];
+	pT[3+ldt*3] = dD[3];
+	pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+	pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+	pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+	pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+	pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+	pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+	// downgrade matrix
+	__m256d
+		_w0, _w1, _w2, _w3, _d0, _t0, _tp, _c0, _c1, _c2, _c3, _a0, _b0, _tz;
+
+	ii = 0;
+#if 1
+	double alpha = 1.0;
+	double beta = 0.0;
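+	// pW0 <- V^T * C, computed 12/8/4 columns of C at a time with dgemm_nn kernels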
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for( ; ii<n-11; ii+=12)
+		{
+		kernel_dgemm_nn_4x12_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+		}
+#endif
+	for( ; ii<n-7; ii+=8)
+		{
+		kernel_dgemm_nn_4x8_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+		}
+	for( ; ii<n-3; ii+=4)
+		{
+		kernel_dgemm_nn_4x4_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+		}
+	if(ii<n)
+		{
+//		kernel_dgemm_nn_4x4_vs_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii], 4, n-ii);
+		kernel_dgemm_nn_4x4_gen_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, 0, &pW0[0+ps*ii], 0, 0, &pW0[0+ps*ii], 0, 0, 4, 0, n-ii);
+		}
+#else
+	for( ; ii<n-3; ii+=4)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		_w0 = _mm256_setzero_pd();
+		_w1 = _mm256_setzero_pd();
+		_w2 = _mm256_setzero_pd();
+		_w3 = _mm256_setzero_pd();
+		for(jj=0; jj<m-3; jj+=4)
+			{
+			//
+			_d0 = _mm256_load_pd( &pVt[0+ps*(0+jj)] );
+			_t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*0] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w0 = _mm256_add_pd( _w0, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*1] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w1 = _mm256_add_pd( _w1, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*2] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w2 = _mm256_add_pd( _w2, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*3] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w3 = _mm256_add_pd( _w3, _tp );
+			//
+			_d0 = _mm256_load_pd( &pVt[0+ps*(1+jj)] );
+			_t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*0] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w0 = _mm256_add_pd( _w0, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*1] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w1 = _mm256_add_pd( _w1, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*2] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w2 = _mm256_add_pd( _w2, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*3] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w3 = _mm256_add_pd( _w3, _tp );
+			//
+			_d0 = _mm256_load_pd( &pVt[0+ps*(2+jj)] );
+			_t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*0] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w0 = _mm256_add_pd( _w0, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*1] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w1 = _mm256_add_pd( _w1, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*2] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w2 = _mm256_add_pd( _w2, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*3] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w3 = _mm256_add_pd( _w3, _tp );
+			//
+			_d0 = _mm256_load_pd( &pVt[0+ps*(3+jj)] );
+			_t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*0] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w0 = _mm256_add_pd( _w0, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*1] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w1 = _mm256_add_pd( _w1, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*2] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w2 = _mm256_add_pd( _w2, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*3] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w3 = _mm256_add_pd( _w3, _tp );
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			_d0 = _mm256_load_pd( &pVt[0+ps*(ll+jj)] );
+			_t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*0] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w0 = _mm256_add_pd( _w0, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*1] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w1 = _mm256_add_pd( _w1, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*2] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w2 = _mm256_add_pd( _w2, _tp );
+			_t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*3] );
+			_tp = _mm256_mul_pd( _d0, _t0 );
+			_w3 = _mm256_add_pd( _w3, _tp );
+			}
+		// TODO mask store
+		_mm256_storeu_pd( &pW[0+ps*0], _w0 );
+		_mm256_storeu_pd( &pW[0+ps*1], _w1 );
+		_mm256_storeu_pd( &pW[0+ps*2], _w2 );
+		_mm256_storeu_pd( &pW[0+ps*3], _w3 );
+		}
+	for( ; ii<n; ii++)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ps*0] = tmp;
+		if(m>1)
+			{
+			d0 = pVt[0+ps*1];
+			tmp = pC[1+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] = tmp;
+			if(m>2)
+				{
+				d0 = pVt[0+ps*2];
+				d1 = pVt[1+ps*2];
+				tmp = pC[2+ps*0];
+				pW[0+ps*0] += d0 * tmp;
+				pW[1+ps*0] += d1 * tmp;
+				pW[2+ps*0] = tmp;
+				if(m>3)
+					{
+					d0 = pVt[0+ps*3];
+					d1 = pVt[1+ps*3];
+					d2 = pVt[2+ps*3];
+					tmp = pC[3+ps*0];
+					pW[0+ps*0] += d0 * tmp;
+					pW[1+ps*0] += d1 * tmp;
+					pW[2+ps*0] += d2 * tmp;
+					pW[3+ps*0] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pVt[0+ps*(0+jj)];
+			d1 = pVt[1+ps*(0+jj)];
+			d2 = pVt[2+ps*(0+jj)];
+			d3 = pVt[3+ps*(0+jj)];
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] += d1 * tmp;
+			pW[2+ps*0] += d2 * tmp;
+			pW[3+ps*0] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(1+jj)];
+			d1 = pVt[1+ps*(1+jj)];
+			d2 = pVt[2+ps*(1+jj)];
+			d3 = pVt[3+ps*(1+jj)];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] += d1 * tmp;
+			pW[2+ps*0] += d2 * tmp;
+			pW[3+ps*0] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(2+jj)];
+			d1 = pVt[1+ps*(2+jj)];
+			d2 = pVt[2+ps*(2+jj)];
+			d3 = pVt[3+ps*(2+jj)];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] += d1 * tmp;
+			pW[2+ps*0] += d2 * tmp;
+			pW[3+ps*0] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(3+jj)];
+			d1 = pVt[1+ps*(3+jj)];
+			d2 = pVt[2+ps*(3+jj)];
+			d3 = pVt[3+ps*(3+jj)];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] += d1 * tmp;
+			pW[2+ps*0] += d2 * tmp;
+			pW[3+ps*0] += d3 * tmp;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pVt[0+ps*(ll+jj)];
+			d1 = pVt[1+ps*(ll+jj)];
+			d2 = pVt[2+ps*(ll+jj)];
+			d3 = pVt[3+ps*(ll+jj)];
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ps*0] += d0 * tmp;
+			pW[1+ps*0] += d1 * tmp;
+			pW[2+ps*0] += d2 * tmp;
+			pW[3+ps*0] += d3 * tmp;
+			}
+		}
+#endif
+
+	ii = 0;
+	for( ; ii<n-3; ii+=4)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+
+		// compute W^T *= T
+		_tz = _mm256_setzero_pd();
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*0] );
+		_tp = _mm256_broadcast_sd( &pW[0+ps*0] );
+		_w0 = _mm256_mul_pd( _t0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[0+ps*1] );
+		_w1 = _mm256_mul_pd( _t0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[0+ps*2] );
+		_w2 = _mm256_mul_pd( _t0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[0+ps*3] );
+		_w3 = _mm256_mul_pd( _t0, _tp );
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+		_t0 = _mm256_load_pd( &pT[0+ldt*1] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+		_w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*1] );
+		_w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*2] );
+		_w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*3] );
+		_w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*2] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+		_w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*1] );
+		_w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*2] );
+		_w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*3] );
+		_w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*3] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+		_w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*1] );
+		_w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*2] );
+		_w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*3] );
+		_w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+#else
+		_t0 = _mm256_load_pd( &pT[0+ldt*1] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*1] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w1 = _mm256_add_pd( _w1, _tp );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*2] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w2 = _mm256_add_pd( _w2, _tp );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*3] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w3 = _mm256_add_pd( _w3, _tp );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*2] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*1] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w1 = _mm256_add_pd( _w1, _tp );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*2] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w2 = _mm256_add_pd( _w2, _tp );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*3] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w3 = _mm256_add_pd( _w3, _tp );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*3] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*1] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w1 = _mm256_add_pd( _w1, _tp );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*2] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w2 = _mm256_add_pd( _w2, _tp );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*3] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w3 = _mm256_add_pd( _w3, _tp );
+#endif
+
+		_mm256_store_pd( &pW[0+ps*0], _w0 );
+		_mm256_store_pd( &pW[0+ps*1], _w1 );
+		_mm256_store_pd( &pW[0+ps*2], _w2 );
+		_mm256_store_pd( &pW[0+ps*3], _w3 );
+		}
+	for( ; ii<n; ii++)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+
+		// compute W^T *= T
+		_tz = _mm256_setzero_pd();
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*0] );
+		_tp = _mm256_broadcast_sd( &pW[0+ps*0] );
+		_w0 = _mm256_mul_pd( _t0, _tp );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*1] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+		_tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*2] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+		_tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+
+		_t0 = _mm256_load_pd( &pT[0+ldt*3] );
+		_t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+		_tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+		_tp = _mm256_mul_pd( _t0, _tp );
+		_w0 = _mm256_add_pd( _w0, _tp );
+
+		_mm256_store_pd( &pW[0+ps*0], _w0 );
+		}
+
+	ii = 0;
+	for( ; ii<n-3; ii+=4)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		// compute C -= V * W^T
+		jj = 0;
+		// load
+		c00 = pC[0+jj*sdc+ps*0];
+		c10 = pC[1+jj*sdc+ps*0];
+		c20 = pC[2+jj*sdc+ps*0];
+		c30 = pC[3+jj*sdc+ps*0];
+		c01 = pC[0+jj*sdc+ps*1];
+		c11 = pC[1+jj*sdc+ps*1];
+		c21 = pC[2+jj*sdc+ps*1];
+		c31 = pC[3+jj*sdc+ps*1];
+		// rank1
+		a1 = pD[1+jj*sdd+ps*0];
+		a2 = pD[2+jj*sdd+ps*0];
+		a3 = pD[3+jj*sdd+ps*0];
+		b0 = pW[0+ps*0];
+		c00 -= b0;
+		c10 -= a1*b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		b1 = pW[0+ps*1];
+		c01 -= b1;
+		c11 -= a1*b1;
+		c21 -= a2*b1;
+		c31 -= a3*b1;
+		// rank2
+		a2 = pD[2+jj*sdd+ps*1];
+		a3 = pD[3+jj*sdd+ps*1];
+		b0 = pW[1+ps*0];
+		c10 -= b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		b1 = pW[1+ps*1];
+		c11 -= b1;
+		c21 -= a2*b1;
+		c31 -= a3*b1;
+		// rank3
+		a3 = pD[3+jj*sdd+ps*2];
+		b0 = pW[2+ps*0];
+		c20 -= b0;
+		c30 -= a3*b0;
+		b1 = pW[2+ps*1];
+		c21 -= b1;
+		c31 -= a3*b1;
+		// rank4
+		a3 = pD[3+jj*sdd+ps*3];
+		b0 = pW[3+ps*0];
+		c30 -= b0;
+		b1 = pW[3+ps*1];
+		c31 -= b1;
+		// store
+		pC[0+jj*sdc+ps*0] = c00;
+		pC[0+jj*sdc+ps*1] = c01;
+		if(m>1)
+			{
+			pC[1+jj*sdc+ps*0] = c10;
+			pC[1+jj*sdc+ps*1] = c11;
+			if(m>2)
+				{
+				pC[2+jj*sdc+ps*0] = c20;
+				pC[2+jj*sdc+ps*1] = c21;
+				if(m>3)
+					{
+					pC[3+jj*sdc+ps*0] = c30;
+					pC[3+jj*sdc+ps*1] = c31;
+					}
+				}
+			}
+		// load
+		c00 = pC[0+jj*sdc+ps*2];
+		c10 = pC[1+jj*sdc+ps*2];
+		c20 = pC[2+jj*sdc+ps*2];
+		c30 = pC[3+jj*sdc+ps*2];
+		c01 = pC[0+jj*sdc+ps*3];
+		c11 = pC[1+jj*sdc+ps*3];
+		c21 = pC[2+jj*sdc+ps*3];
+		c31 = pC[3+jj*sdc+ps*3];
+		// rank1
+		a1 = pD[1+jj*sdd+ps*0];
+		a2 = pD[2+jj*sdd+ps*0];
+		a3 = pD[3+jj*sdd+ps*0];
+		b0 = pW[0+ps*2];
+		c00 -= b0;
+		c10 -= a1*b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		b1 = pW[0+ps*3];
+		c01 -= b1;
+		c11 -= a1*b1;
+		c21 -= a2*b1;
+		c31 -= a3*b1;
+		// rank2
+		a2 = pD[2+jj*sdd+ps*1];
+		a3 = pD[3+jj*sdd+ps*1];
+		b0 = pW[1+ps*2];
+		c10 -= b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		b1 = pW[1+ps*3];
+		c11 -= b1;
+		c21 -= a2*b1;
+		c31 -= a3*b1;
+		// rank3
+		a3 = pD[3+jj*sdd+ps*2];
+		b0 = pW[2+ps*2];
+		c20 -= b0;
+		c30 -= a3*b0;
+		b1 = pW[2+ps*3];
+		c21 -= b1;
+		c31 -= a3*b1;
+		// rank4
+		a3 = pD[3+jj*sdd+ps*3];
+		b0 = pW[3+ps*2];
+		c30 -= b0;
+		b1 = pW[3+ps*3];
+		c31 -= b1;
+		// store
+		pC[0+jj*sdc+ps*2] = c00;
+		pC[0+jj*sdc+ps*3] = c01;
+		if(m>1)
+			{
+			pC[1+jj*sdc+ps*2] = c10;
+			pC[1+jj*sdc+ps*3] = c11;
+			if(m>2)
+				{
+				pC[2+jj*sdc+ps*2] = c20;
+				pC[2+jj*sdc+ps*3] = c21;
+				if(m>3)
+					{
+					pC[3+jj*sdc+ps*2] = c30;
+					pC[3+jj*sdc+ps*3] = c31;
+					}
+				}
+			}
+		}
+	for( ; ii<n; ii++)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		// compute C -= V * W^T
+		jj = 0;
+		// load
+		c00 = pC[0+jj*sdc+ps*0];
+		c10 = pC[1+jj*sdc+ps*0];
+		c20 = pC[2+jj*sdc+ps*0];
+		c30 = pC[3+jj*sdc+ps*0];
+		// rank1
+		a1 = pD[1+jj*sdd+ps*0];
+		a2 = pD[2+jj*sdd+ps*0];
+		a3 = pD[3+jj*sdd+ps*0];
+		b0 = pW[0+ps*0];
+		c00 -= b0;
+		c10 -= a1*b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		// rank2
+		a2 = pD[2+jj*sdd+ps*1];
+		a3 = pD[3+jj*sdd+ps*1];
+		b0 = pW[1+ps*0];
+		c10 -= b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		// rank3
+		a3 = pD[3+jj*sdd+ps*2];
+		b0 = pW[2+ps*0];
+		c20 -= b0;
+		c30 -= a3*b0;
+		// rank4
+		a3 = pD[3+jj*sdd+ps*3];
+		b0 = pW[3+ps*0];
+		c30 -= b0;
+		// store
+		pC[0+jj*sdc+ps*0] = c00;
+		if(m>1)
+			{
+			pC[1+jj*sdc+ps*0] = c10;
+			if(m>2)
+				{
+				pC[2+jj*sdc+ps*0] = c20;
+				if(m>3)
+					{
+					pC[3+jj*sdc+ps*0] = c30;
+					}
+				}
+			}
+		}
+
+#if 1
+	jj = 4;
+#if defined(TARGET_X64_INTEL_HASWELL)
+	for(; jj<m-11; jj+=12)
+		{
+		kernel_dger4_sub_12r_lib4(n, &pD[jj*sdd], sdd, &pW0[0], &pC0[jj*sdc], sdc);
+		}
+#endif
+	for(; jj<m-7; jj+=8)
+		{
+		kernel_dger4_sub_8r_lib4(n, &pD[jj*sdd], sdd, &pW0[0], &pC0[jj*sdc], sdc);
+		}
+	for(; jj<m-3; jj+=4)
+		{
+		kernel_dger4_sub_4r_lib4(n, &pD[jj*sdd], &pW0[0], &pC0[jj*sdc]);
+		}
+	if(jj<m)
+		{
+		kernel_dger4_sub_4r_vs_lib4(n, &pD[jj*sdd], &pW0[0], &pC0[jj*sdc], m-jj);
+		}
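+	// The bulk of the rank-4 update of pC0 is delegated to the dger4_sub
+	// kernels above: 12 rows per call where available (Haswell), then 8
+	// and 4, with the _vs variant cleaning up the last m-jj rows.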
+#else
+	ii = 0;
+	for( ; ii<n-3; ii+=4)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			// load
+			_c0 = _mm256_load_pd( &pC[0+jj*sdc+ps*0] );
+			_c1 = _mm256_load_pd( &pC[0+jj*sdc+ps*1] );
+			_c2 = _mm256_load_pd( &pC[0+jj*sdc+ps*2] );
+			_c3 = _mm256_load_pd( &pC[0+jj*sdc+ps*3] );
+			//
+			_a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*0] );
+			_b0 = _mm256_broadcast_sd( &pW[0+ps*0] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c0 = _mm256_sub_pd( _c0, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[0+ps*1] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c1 = _mm256_sub_pd( _c1, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[0+ps*2] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c2 = _mm256_sub_pd( _c2, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[0+ps*3] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c3 = _mm256_sub_pd( _c3, _tp );
+			//
+			_a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*1] );
+			_b0 = _mm256_broadcast_sd( &pW[1+ps*0] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c0 = _mm256_sub_pd( _c0, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[1+ps*1] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c1 = _mm256_sub_pd( _c1, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[1+ps*2] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c2 = _mm256_sub_pd( _c2, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[1+ps*3] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c3 = _mm256_sub_pd( _c3, _tp );
+			//
+			_a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*2] );
+			_b0 = _mm256_broadcast_sd( &pW[2+ps*0] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c0 = _mm256_sub_pd( _c0, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[2+ps*1] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c1 = _mm256_sub_pd( _c1, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[2+ps*2] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c2 = _mm256_sub_pd( _c2, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[2+ps*3] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c3 = _mm256_sub_pd( _c3, _tp );
+			//
+			_a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*3] );
+			_b0 = _mm256_broadcast_sd( &pW[3+ps*0] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c0 = _mm256_sub_pd( _c0, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[3+ps*1] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c1 = _mm256_sub_pd( _c1, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[3+ps*2] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c2 = _mm256_sub_pd( _c2, _tp );
+			_b0 = _mm256_broadcast_sd( &pW[3+ps*3] );
+			_tp = _mm256_mul_pd( _a0, _b0 );
+			_c3 = _mm256_sub_pd( _c3, _tp );
+			// store
+			_mm256_store_pd( &pC[0+jj*sdc+ps*0], _c0 );
+			_mm256_store_pd( &pC[0+jj*sdc+ps*1], _c1 );
+			_mm256_store_pd( &pC[0+jj*sdc+ps*2], _c2 );
+			_mm256_store_pd( &pC[0+jj*sdc+ps*3], _c3 );
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			// load
+			c00 = pC[ll+jj*sdc+ps*0];
+			c01 = pC[ll+jj*sdc+ps*1];
+			//
+			a0 = pD[ll+jj*sdd+ps*0];
+			b0 = pW[0+ps*0];
+			c00 -= a0*b0;
+			b1 = pW[0+ps*1];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*1];
+			b0 = pW[1+ps*0];
+			c00 -= a0*b0;
+			b1 = pW[1+ps*1];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*2];
+			b0 = pW[2+ps*0];
+			c00 -= a0*b0;
+			b1 = pW[2+ps*1];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*3];
+			b0 = pW[3+ps*0];
+			c00 -= a0*b0;
+			b1 = pW[3+ps*1];
+			c01 -= a0*b1;
+			// store
+			pC[ll+jj*sdc+ps*0] = c00;
+			pC[ll+jj*sdc+ps*1] = c01;
+			// load
+			c00 = pC[ll+jj*sdc+ps*2];
+			c01 = pC[ll+jj*sdc+ps*3];
+			//
+			a0 = pD[ll+jj*sdd+ps*0];
+			b0 = pW[0+ps*2];
+			c00 -= a0*b0;
+			b1 = pW[0+ps*3];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*1];
+			b0 = pW[1+ps*2];
+			c00 -= a0*b0;
+			b1 = pW[1+ps*3];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*2];
+			b0 = pW[2+ps*2];
+			c00 -= a0*b0;
+			b1 = pW[2+ps*3];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*3];
+			b0 = pW[3+ps*2];
+			c00 -= a0*b0;
+			b1 = pW[3+ps*3];
+			c01 -= a0*b1;
+			// store
+			pC[ll+jj*sdc+ps*2] = c00;
+			pC[ll+jj*sdc+ps*3] = c01;
+			}
+		}
+	for( ; ii<n; ii++)
+		{
+		pW = pW0+ii*ps;
+		pC = pC0+ii*ps;
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			// load
+			c00 = pC[0+jj*sdc+ps*0];
+			c10 = pC[1+jj*sdc+ps*0];
+			c20 = pC[2+jj*sdc+ps*0];
+			c30 = pC[3+jj*sdc+ps*0];
+			//
+			a0 = pD[0+jj*sdd+ps*0];
+			a1 = pD[1+jj*sdd+ps*0];
+			a2 = pD[2+jj*sdd+ps*0];
+			a3 = pD[3+jj*sdd+ps*0];
+			b0 = pW[0+ps*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			//
+			a0 = pD[0+jj*sdd+ps*1];
+			a1 = pD[1+jj*sdd+ps*1];
+			a2 = pD[2+jj*sdd+ps*1];
+			a3 = pD[3+jj*sdd+ps*1];
+			b0 = pW[1+ps*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			//
+			a0 = pD[0+jj*sdd+ps*2];
+			a1 = pD[1+jj*sdd+ps*2];
+			a2 = pD[2+jj*sdd+ps*2];
+			a3 = pD[3+jj*sdd+ps*2];
+			b0 = pW[2+ps*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			//
+			a0 = pD[0+jj*sdd+ps*3];
+			a1 = pD[1+jj*sdd+ps*3];
+			a2 = pD[2+jj*sdd+ps*3];
+			a3 = pD[3+jj*sdd+ps*3];
+			b0 = pW[3+ps*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			// store
+			pC[0+jj*sdc+ps*0] = c00;
+			pC[1+jj*sdc+ps*0] = c10;
+			pC[2+jj*sdc+ps*0] = c20;
+			pC[3+jj*sdc+ps*0] = c30;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			// load
+			c00 = pC[ll+jj*sdc+ps*0];
+			//
+			a0 = pD[ll+jj*sdd+ps*0];
+			b0 = pW[0+ps*0];
+			c00 -= a0*b0;
+			//
+			a0 = pD[ll+jj*sdd+ps*1];
+			b0 = pW[1+ps*0];
+			c00 -= a0*b0;
+			//
+			a0 = pD[ll+jj*sdd+ps*2];
+			b0 = pW[2+ps*0];
+			c00 -= a0*b0;
+			//
+			a0 = pD[ll+jj*sdd+ps*3];
+			b0 = pW[3+ps*0];
+			c00 -= a0*b0;
+			// store
+			pC[ll+jj*sdc+ps*0] = c00;
+			}
+		}
+#endif
+
+	return;
+	}
+
+
+
+// assume n>=4
+void kernel_dgelqf_4_lib4(int n, double *pD, double *dD)
+	{
+	int ii, jj, ll;
+	double alpha, beta, tmp, w1, w2, w3;
+	const int ps = 4;
+	// first column
+	beta = 0.0;
+	for(ii=1; ii<n; ii++)
+		{
+		tmp = pD[0+ps*ii];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		dD[0] = 0.0;
+		tmp = 0.0;
+		goto col2;
+		}
+	alpha = pD[0+ps*0];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[0] = (beta-alpha) / beta;
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[0+ps*0] = beta;
+	w1 = pD[1+ps*0];
+	w2 = pD[2+ps*0];
+	w3 = pD[3+ps*0];
+	//
+	pD[0+ps*1] *= tmp;
+	w1 += pD[1+ps*1] * pD[0+ps*1];
+	w2 += pD[2+ps*1] * pD[0+ps*1];
+	w3 += pD[3+ps*1] * pD[0+ps*1];
+	//
+	pD[0+ps*2] *= tmp;
+	w1 += pD[1+ps*2] * pD[0+ps*2];
+	w2 += pD[2+ps*2] * pD[0+ps*2];
+	w3 += pD[3+ps*2] * pD[0+ps*2];
+	//
+	pD[0+ps*3] *= tmp;
+	w1 += pD[1+ps*3] * pD[0+ps*3];
+	w2 += pD[2+ps*3] * pD[0+ps*3];
+	w3 += pD[3+ps*3] * pD[0+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[0+ps*ii] *= tmp;
+		w1 += pD[1+ps*ii] * pD[0+ps*ii];
+		w2 += pD[2+ps*ii] * pD[0+ps*ii];
+		w3 += pD[3+ps*ii] * pD[0+ps*ii];
+		}
+	//
+	w1 = - dD[0] * w1;
+	w2 = - dD[0] * w2;
+	w3 = - dD[0] * w3;
+	//
+	pD[1+ps*0] += w1;
+	pD[2+ps*0] += w2;
+	pD[3+ps*0] += w3;
+	//
+	pD[1+ps*1] += w1 * pD[0+ps*1];
+	pD[2+ps*1] += w2 * pD[0+ps*1];
+	pD[3+ps*1] += w3 * pD[0+ps*1];
+	//
+	pD[1+ps*2] += w1 * pD[0+ps*2];
+	pD[2+ps*2] += w2 * pD[0+ps*2];
+	pD[3+ps*2] += w3 * pD[0+ps*2];
+	beta = pD[1+ps*2] * pD[1+ps*2];
+	//
+	pD[1+ps*3] += w1 * pD[0+ps*3];
+	pD[2+ps*3] += w2 * pD[0+ps*3];
+	pD[3+ps*3] += w3 * pD[0+ps*3];
+	beta += pD[1+ps*3] * pD[1+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] += w1 * pD[0+ps*ii];
+		pD[2+ps*ii] += w2 * pD[0+ps*ii];
+		pD[3+ps*ii] += w3 * pD[0+ps*ii];
+		beta += pD[1+ps*ii] * pD[1+ps*ii];
+		}
+	// second column
+col2:
+	if(beta==0.0)
+		{
+		dD[1] = 0.0;
+		tmp = 0.0;
+		goto col3;
+		}
+	alpha = pD[1+ps*1];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[1] = (beta-alpha) / beta;
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[1+ps*1] = beta;
+	w2 = pD[2+ps*1];
+	w3 = pD[3+ps*1];
+	//
+	pD[1+ps*2] *= tmp;
+	w2 += pD[2+ps*2] * pD[1+ps*2];
+	w3 += pD[3+ps*2] * pD[1+ps*2];
+	//
+	pD[1+ps*3] *= tmp;
+	w2 += pD[2+ps*3] * pD[1+ps*3];
+	w3 += pD[3+ps*3] * pD[1+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] *= tmp;
+		w2 += pD[2+ps*ii] * pD[1+ps*ii];
+		w3 += pD[3+ps*ii] * pD[1+ps*ii];
+		}
+	//
+	w2 = - dD[1] * w2;
+	w3 = - dD[1] * w3;
+	//
+	pD[2+ps*1] += w2;
+	pD[3+ps*1] += w3;
+	//
+	pD[2+ps*2] += w2 * pD[1+ps*2];
+	pD[3+ps*2] += w3 * pD[1+ps*2];
+	//
+	pD[2+ps*3] += w2 * pD[1+ps*3];
+	pD[3+ps*3] += w3 * pD[1+ps*3];
+	beta = pD[2+ps*3] * pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] += w2 * pD[1+ps*ii];
+		pD[3+ps*ii] += w3 * pD[1+ps*ii];
+		beta += pD[2+ps*ii] * pD[2+ps*ii];
+		}
+	// third column
+col3:
+	if(beta==0.0)
+		{
+		dD[2] = 0.0;
+		tmp = 0.0;
+		goto col4;
+		}
+	alpha = pD[2+ps*2];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[2] = (beta-alpha) / beta;
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[2+ps*2] = beta;
+	w3 = pD[3+ps*2];
+	//
+	pD[2+ps*3] *= tmp;
+	w3 += pD[3+ps*3] * pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] *= tmp;
+		w3 += pD[3+ps*ii] * pD[2+ps*ii];
+		}
+	//
+	w3 = - dD[2] * w3;
+	//
+	pD[3+ps*2] += w3;
+	//
+	pD[3+ps*3] += w3 * pD[2+ps*3];
+	//
+	beta = 0.0;
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] += w3 * pD[2+ps*ii];
+		beta += pD[3+ps*ii] * pD[3+ps*ii];
+		}
+	// fourth column
+col4:
+	if(beta==0.0)
+		{
+		dD[3] = 0.0;
+		tmp = 0.0;
+		return;
+		}
+	alpha = pD[3+ps*3];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[3] = (beta-alpha) / beta;
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[3+ps*3] = beta;
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] *= tmp;
+		}
+	return;
+	}
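+
+// Note on the reflector convention used above (a sketch, not a statement of
+// the library interface): row i of the panel is reduced by a Householder
+// reflector H_i = I - tau_i * v_i * v_i^T with v_i(i) = 1, where
+//   beta     = -sign(alpha) * sqrt(alpha^2 + ||x||^2)
+//   tau_i    = (beta - alpha) / beta          (stored in dD[i])
+//   v_i tail = x / (alpha - beta)             (stored right of the diagonal)
+// with alpha the diagonal entry and x the rest of the row; the diagonal is
+// overwritten with beta, so on exit the lower triangle of the leading 4x4
+// block holds L and the reflector tails occupy the entries to the right of
+// each diagonal element.
+//
+// Hypothetical call site (sizes chosen for illustration only):
+//   double pA[4*12]; // one 4-row panel of 12 columns, panel-major, ps=4
+//   double dD[4];    // tau factors of the 4 reflectors
+//   kernel_dgelqf_4_lib4(12, pA, dD);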
+
+
+
+// unblocked algorithm
+void kernel_dgelqf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+	{
+	if(m<=0 || n<=0)
+		return;
+	int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+	const int ps = 4;
+	imax = k;//m<n ? m : n;
+	double alpha, beta, tmp;
+	double w00, w01,
+		   w10, w11,
+		   w20, w21,
+		   w30, w31;
+	__m256d
+		_a0, _b0, _t0, _w0, _w1;
+	double *pC00, *pC10, *pC10a, *pC20, *pC20a, *pC01, *pC11;
+	double pT[4];
+	int ldt = 2;
+	double *pD0 = pD-offD;
+	ii = 0;
+#if 1 // rank 2
+	for(; ii<imax-1; ii+=2)
+		{
+		// first row
+		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+		beta = 0.0;
+		for(jj=1; jj<n-ii; jj++)
+			{
+			tmp = pC00[0+ps*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			pC00[0] = beta;
+			for(jj=1; jj<n-ii; jj++)
+				pC00[0+ps*jj] *= tmp;
+			}
+		pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+		kmax = n-ii;
+		w00 = pC10[0+ps*0]; // pC00[0+ps*0] = 1.0
+		for(kk=1; kk<kmax; kk++)
+			{
+			w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+			}
+		w00 = - w00*dD[ii];
+		pC10[0+ps*0] += w00; // pC00[0+ps*0] = 1.0
+		for(kk=1; kk<kmax; kk++)
+			{
+			pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+			}
+		// second row
+		pC11 = pC10+ps*1;
+		beta = 0.0;
+		for(jj=1; jj<n-(ii+1); jj++)
+			{
+			tmp = pC11[0+ps*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			dD[(ii+1)] = 0.0;
+			}
+		else
+			{
+			alpha = pC11[0+ps*0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[(ii+1)] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			pC11[0+ps*0] = beta;
+			for(jj=1; jj<n-(ii+1); jj++)
+				pC11[0+ps*jj] *= tmp;
+			}
+		// compute T
+		kmax = n-ii;
+		tmp = 1.0*0.0 + pC00[0+ps*1]*1.0;
+		for(kk=2; kk<kmax; kk++)
+			tmp += pC00[0+ps*kk]*pC10[0+ps*kk];
+		pT[0+ldt*0] = - dD[ii+0];
+		pT[0+ldt*1] = + dD[ii+1] * tmp * dD[ii+0];
+		pT[1+ldt*1] = - dD[ii+1];
+		// downgrade
+		kmax = n-ii;
+		jmax = m-ii-2;
+		jmax0 = (ps-((ii+2+offD)&(ps-1)))&(ps-1);
+		jmax0 = jmax<jmax0 ? jmax : jmax0;
+		jj = 0;
+		pC20a = &pD0[((offD+ii+2)&(ps-1))+((offD+ii+2)-((offD+ii+2)&(ps-1)))*sdd+ii*ps];
+		pC20 = pC20a;
+		if(jmax0>0)
+			{
+			for( ; jj<jmax0; jj++)
+				{
+				w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+				w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+				for(kk=2; kk<kmax; kk++)
+					{
+					w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+					w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+					}
+				w01 = w00*pT[0+ldt*1] + w01*pT[1+ldt*1];
+				w00 = w00*pT[0+ldt*0];
+				pC20[0+ps*0] += w00*1.0          + w01*0.0;
+				pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+				for(kk=2; kk<kmax; kk++)
+					{
+					pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+					}
+				pC20 += 1;
+				}
+			pC20 += -ps+ps*sdd;
+			}
+		for( ; jj<jmax-3; jj+=4)
+			{
+			//
+			_w0 = _mm256_load_pd( &pC20[0+ps*0] );
+			_a0 = _mm256_load_pd( &pC20[0+ps*1] );
+			_b0 = _mm256_broadcast_sd( &pC00[0+ps*1] );
+			_t0 = _mm256_mul_pd( _a0, _b0 );
+			_w0 = _mm256_add_pd( _w0, _t0 );
+			_w1 = _mm256_load_pd( &pC20[0+ps*1] );
+			for(kk=2; kk<kmax; kk++)
+				{
+				_a0 = _mm256_load_pd( &pC20[0+ps*kk] );
+				_b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+				_t0 = _mm256_mul_pd( _a0, _b0 );
+				_w0 = _mm256_add_pd( _w0, _t0 );
+				_b0 = _mm256_broadcast_sd( &pC10[0+ps*kk] );
+				_t0 = _mm256_mul_pd( _a0, _b0 );
+				_w1 = _mm256_add_pd( _w1, _t0 );
+				}
+			//
+			_b0 = _mm256_broadcast_sd( &pT[1+ldt*1] );
+			_w1 = _mm256_mul_pd( _w1, _b0 );
+			_b0 = _mm256_broadcast_sd( &pT[0+ldt*1] );
+			_t0 = _mm256_mul_pd( _w0, _b0 );
+			_w1 = _mm256_add_pd( _w1, _t0 );
+			_b0 = _mm256_broadcast_sd( &pT[0+ldt*0] );
+			_w0 = _mm256_mul_pd( _w0, _b0 );
+			//
+			_a0 = _mm256_load_pd( &pC20[0+ps*0] );
+			_a0 = _mm256_add_pd( _a0, _w0 );
+			_mm256_store_pd( &pC20[0+ps*0], _a0 );
+			_a0 = _mm256_load_pd( &pC20[0+ps*1] );
+			_b0 = _mm256_broadcast_sd( &pC00[0+ps*1] );
+			_t0 = _mm256_mul_pd( _w0, _b0 );
+			_a0 = _mm256_add_pd( _a0, _t0 );
+			_a0 = _mm256_add_pd( _a0, _w1 );
+			_mm256_store_pd( &pC20[0+ps*1], _a0 );
+			for(kk=2; kk<kmax; kk++)
+				{
+				_a0 = _mm256_load_pd( &pC20[0+ps*kk] );
+				_b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+				_t0 = _mm256_mul_pd( _w0, _b0 );
+				_a0 = _mm256_add_pd( _a0, _t0 );
+				_b0 = _mm256_broadcast_sd( &pC10[0+ps*kk] );
+				_t0 = _mm256_mul_pd( _w1, _b0 );
+				_a0 = _mm256_add_pd( _a0, _t0 );
+				_mm256_store_pd( &pC20[0+ps*kk], _a0 );
+				}
+			pC20 += ps*sdd;
+			}
+		for(ll=0; ll<jmax-jj; ll++)
+			{
+			w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+			w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+			for(kk=2; kk<kmax; kk++)
+				{
+				w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+				w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+				}
+			w01 = w00*pT[0+ldt*1] + w01*pT[1+ldt*1];
+			w00 = w00*pT[0+ldt*0];
+			pC20[0+ps*0] += w00*1.0          + w01*0.0;
+			pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+			for(kk=2; kk<kmax; kk++)
+				{
+				pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+				}
+			pC20 += 1;
+			}
+		}
+#endif
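+	// any reflector left over by the rank-2 blocking (imax odd, or the
+	// block above disabled) is handled one at a time by the rank-1 loop below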
+	for(; ii<imax; ii++)
+		{
+		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+		beta = 0.0;
+		for(jj=1; jj<n-ii; jj++)
+			{
+			tmp = pC00[0+ps*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			pC00[0] = beta;
+			for(jj=1; jj<n-ii; jj++)
+				pC00[0+ps*jj] *= tmp;
+			}
+		if(ii<n)
+			{
+			// compute T
+			pT[0+ldt*0] = - dD[ii+0];
+			// downgrade
+			kmax = n-ii;
+			jmax = m-ii-1;
+			jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+			jmax0 = jmax<jmax0 ? jmax : jmax0;
+			jj = 0;
+			pC10a = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+			pC10 = pC10a;
+			if(jmax0>0)
+				{
+				for( ; jj<jmax0; jj++)
+					{
+					w00 = pC10[0+ps*0];
+					for(kk=1; kk<kmax; kk++)
+						{
+						w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+						}
+					w00 = w00*pT[0+ldt*0];
+					pC10[0+ps*0] += w00;
+					for(kk=1; kk<kmax; kk++)
+						{
+						pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+						}
+					pC10 += 1;
+					}
+				pC10 += -ps+ps*sdd;
+				}
+			for( ; jj<jmax-3; jj+=4)
+				{
+				//
+				_w0 = _mm256_load_pd( &pC10[0+ps*0] );
+				for(kk=1; kk<kmax; kk++)
+					{
+					_a0 = _mm256_load_pd( &pC10[0+ps*kk] );
+					_b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+					_t0 = _mm256_mul_pd( _a0, _b0 );
+					_w0 = _mm256_add_pd( _w0, _t0 );
+					}
+				//
+				_b0 = _mm256_broadcast_sd( &pT[0+ldt*0] );
+				_w0 = _mm256_mul_pd( _w0, _b0 );
+				//
+				_a0 = _mm256_load_pd( &pC10[0+ps*0] );
+				_a0 = _mm256_add_pd( _a0, _w0 );
+				_mm256_store_pd( &pC10[0+ps*0], _a0 );
+				for(kk=1; kk<kmax; kk++)
+					{
+					_a0 = _mm256_load_pd( &pC10[0+ps*kk] );
+					_b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+					_t0 = _mm256_mul_pd( _w0, _b0 );
+					_a0 = _mm256_add_pd( _a0, _t0 );
+					_mm256_store_pd( &pC10[0+ps*kk], _a0 );
+					}
+				pC10 += ps*sdd;
+				}
+			for(ll=0; ll<jmax-jj; ll++)
+				{
+				w00 = pC10[0+ps*0];
+				for(kk=1; kk<kmax; kk++)
+					{
+					w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+					}
+				w00 = w00*pT[0+ldt*0];
+				pC10[0+ps*0] += w00;
+				for(kk=1; kk<kmax; kk++)
+					{
+					pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+					}
+				pC10 += 1;
+				}
+			}
+		}
+	return;
+	}
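+
+// Addressing note (illustrative, restating the index expressions above): in
+// panel-major storage with panel size ps and panel stride sdd, element (r,c)
+// counted from pD0 is found at
+//   pD0[ (r & (ps-1)) + (r - (r & (ps-1)))*sdd + c*ps ]
+// i.e. r%ps picks the row inside its panel and (r/ps)*ps*sdd jumps to that
+// panel; the kernel uses r = offD+ii and c = ii, which is how pD is allowed
+// to point into the middle of a panel.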
+
+
+
+// assume kmax>=4
+void kernel_dlarft_4_lib4(int kmax, double *pD, double *dD, double *pT)
+	{
+	const int ps = 4;
+	int kk;
+	double v10,
+	       v20, v21,
+		   v30, v31, v32;
+	// 0
+	// 1
+	v10 =  pD[0+ps*1];
+	// 2
+	v10 += pD[1+ps*2]*pD[0+ps*2];
+	v20 =  pD[0+ps*2];
+	v21 =  pD[1+ps*2];
+	// 3
+	v10 += pD[1+ps*3]*pD[0+ps*3];
+	v20 += pD[2+ps*3]*pD[0+ps*3];
+	v21 += pD[2+ps*3]*pD[1+ps*3];
+	v30 =  pD[0+ps*3];
+	v31 =  pD[1+ps*3];
+	v32 =  pD[2+ps*3];
+	//
+	for(kk=4; kk<kmax; kk++)
+		{
+		v10 += pD[1+ps*kk]*pD[0+ps*kk];
+		v20 += pD[2+ps*kk]*pD[0+ps*kk];
+		v30 += pD[3+ps*kk]*pD[0+ps*kk];
+		v21 += pD[2+ps*kk]*pD[1+ps*kk];
+		v31 += pD[3+ps*kk]*pD[1+ps*kk];
+		v32 += pD[3+ps*kk]*pD[2+ps*kk];
+		}
+	pT[0+ps*0] = - dD[0];
+	pT[1+ps*1] = - dD[1];
+	pT[2+ps*2] = - dD[2];
+	pT[3+ps*3] = - dD[3];
+	pT[0+ps*1] = - dD[1] * (v10*pT[0+ps*0]);
+	pT[1+ps*2] = - dD[2] * (v21*pT[1+ps*1]);
+	pT[2+ps*3] = - dD[3] * (v32*pT[2+ps*2]);
+	pT[0+ps*2] = - dD[2] * (v20*pT[0+ps*0] + v21*pT[0+ps*1]);
+	pT[1+ps*3] = - dD[3] * (v31*pT[1+ps*1] + v32*pT[1+ps*2]);
+	pT[0+ps*3] = - dD[3] * (v30*pT[0+ps*0] + v31*pT[0+ps*1] + v32*pT[0+ps*2]);
+	return;
+	}
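+
+// T is the 4x4 upper triangular factor of the compact WY form of the four
+// reflectors stored row-wise in pD (sign convention: the diagonal holds
+// -dD[i], i.e. -tau_i, rather than +tau_i as in LAPACK's dlarft).  The
+// entries above follow the usual recurrence, sketched as
+//   T(i,i)     = -tau_i
+//   T(0:i-1,i) = -tau_i * T(0:i-1,0:i-1) * (V * v_i)(0:i-1)
+// where the v10..v32 accumulators are exactly the dot products
+// (V * v_i)(j) = v_j . v_i of the reflector rows, implicit unit diagonal
+// included.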
+
+
+
+// assume n>=4
+#if ! defined(TARGET_X64_INTEL_HASWELL)
+void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT)
+	{
+	int ii, jj, ll;
+	double alpha, beta, tmp, w0, w1, w2, w3;
+	const int ps = 4;
+	// zero tau matrix
+	for(ii=0; ii<16; ii++)
+		pT[ii] = 0.0;
+	// first column
+	beta = 0.0;
+	for(ii=1; ii<n; ii++)
+		{
+		tmp = pD[0+ps*ii];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		dD[0] = 0.0;
+		tmp = 0.0;
+		goto col2;
+		}
+	alpha = pD[0+ps*0];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[0] = (beta-alpha) / beta;
+	pT[0+ps*0] = - dD[0];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[0+ps*0] = beta;
+	w1 = pD[1+ps*0];
+	w2 = pD[2+ps*0];
+	w3 = pD[3+ps*0];
+	//
+	pD[0+ps*1] *= tmp;
+	w1 += pD[1+ps*1] * pD[0+ps*1];
+	w2 += pD[2+ps*1] * pD[0+ps*1];
+	w3 += pD[3+ps*1] * pD[0+ps*1];
+	//
+	pD[0+ps*2] *= tmp;
+	w1 += pD[1+ps*2] * pD[0+ps*2];
+	w2 += pD[2+ps*2] * pD[0+ps*2];
+	w3 += pD[3+ps*2] * pD[0+ps*2];
+	//
+	pD[0+ps*3] *= tmp;
+	w1 += pD[1+ps*3] * pD[0+ps*3];
+	w2 += pD[2+ps*3] * pD[0+ps*3];
+	w3 += pD[3+ps*3] * pD[0+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[0+ps*ii] *= tmp;
+		w1 += pD[1+ps*ii] * pD[0+ps*ii];
+		w2 += pD[2+ps*ii] * pD[0+ps*ii];
+		w3 += pD[3+ps*ii] * pD[0+ps*ii];
+		}
+	//
+	w1 = - dD[0] * w1;
+	w2 = - dD[0] * w2;
+	w3 = - dD[0] * w3;
+	//
+	pD[1+ps*0] += w1;
+	pD[2+ps*0] += w2;
+	pD[3+ps*0] += w3;
+	//
+	pD[1+ps*1] += w1 * pD[0+ps*1];
+	pD[2+ps*1] += w2 * pD[0+ps*1];
+	pD[3+ps*1] += w3 * pD[0+ps*1];
+	//
+	pD[1+ps*2] += w1 * pD[0+ps*2];
+	pD[2+ps*2] += w2 * pD[0+ps*2];
+	pD[3+ps*2] += w3 * pD[0+ps*2];
+	beta = pD[1+ps*2] * pD[1+ps*2];
+	//
+	pD[1+ps*3] += w1 * pD[0+ps*3];
+	pD[2+ps*3] += w2 * pD[0+ps*3];
+	pD[3+ps*3] += w3 * pD[0+ps*3];
+	beta += pD[1+ps*3] * pD[1+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] += w1 * pD[0+ps*ii];
+		pD[2+ps*ii] += w2 * pD[0+ps*ii];
+		pD[3+ps*ii] += w3 * pD[0+ps*ii];
+		beta += pD[1+ps*ii] * pD[1+ps*ii];
+		}
+	// second column
+col2:
+	if(beta==0.0)
+		{
+		dD[1] = 0.0;
+		tmp = 0.0;
+		goto col3;
+		}
+	alpha = pD[1+ps*1];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[1] = (beta-alpha) / beta;
+	pT[1+ps*1] = - dD[1];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[1+ps*1] = beta;
+	w0 = pD[0+ps*1]; //
+	w2 = pD[2+ps*1];
+	w3 = pD[3+ps*1];
+	//
+	pD[1+ps*2] *= tmp;
+	w0 += pD[0+ps*2] * pD[1+ps*2]; //
+	w2 += pD[2+ps*2] * pD[1+ps*2];
+	w3 += pD[3+ps*2] * pD[1+ps*2];
+	//
+	pD[1+ps*3] *= tmp;
+	w0 += pD[0+ps*3] * pD[1+ps*3]; //
+	w2 += pD[2+ps*3] * pD[1+ps*3];
+	w3 += pD[3+ps*3] * pD[1+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] *= tmp;
+		w0 += pD[0+ps*ii] * pD[1+ps*ii]; //
+		w2 += pD[2+ps*ii] * pD[1+ps*ii];
+		w3 += pD[3+ps*ii] * pD[1+ps*ii];
+		}
+	//
+	pT[0+ps*1] = - dD[1] * (w0*pT[0+ps*0]);
+	w2 = - dD[1] * w2;
+	w3 = - dD[1] * w3;
+	//
+	pD[2+ps*1] += w2;
+	pD[3+ps*1] += w3;
+	//
+	pD[2+ps*2] += w2 * pD[1+ps*2];
+	pD[3+ps*2] += w3 * pD[1+ps*2];
+	//
+	pD[2+ps*3] += w2 * pD[1+ps*3];
+	pD[3+ps*3] += w3 * pD[1+ps*3];
+	beta = pD[2+ps*3] * pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] += w2 * pD[1+ps*ii];
+		pD[3+ps*ii] += w3 * pD[1+ps*ii];
+		beta += pD[2+ps*ii] * pD[2+ps*ii];
+		}
+	// third column
+col3:
+	if(beta==0.0)
+		{
+		dD[2] = 0.0;
+		tmp = 0.0;
+		goto col4;
+		}
+	alpha = pD[2+ps*2];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[2] = (beta-alpha) / beta;
+	pT[2+ps*2] = - dD[2];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[2+ps*2] = beta;
+	w0 = pD[0+ps*2];
+	w1 = pD[1+ps*2];
+	w3 = pD[3+ps*2];
+	//
+	pD[2+ps*3] *= tmp;
+	w0 += pD[0+ps*3] * pD[2+ps*3];
+	w1 += pD[1+ps*3] * pD[2+ps*3];
+	w3 += pD[3+ps*3] * pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] *= tmp;
+		w0 += pD[0+ps*ii] * pD[2+ps*ii];
+		w1 += pD[1+ps*ii] * pD[2+ps*ii];
+		w3 += pD[3+ps*ii] * pD[2+ps*ii];
+		}
+	//
+	pT[1+ps*2] = - dD[2] * (w1*pT[1+ps*1]);
+	pT[0+ps*2] = - dD[2] * (w0*pT[0+ps*0] + w1*pT[0+ps*1]);
+	w3 = - dD[2] * w3;
+	//
+	pD[3+ps*2] += w3;
+	//
+	pD[3+ps*3] += w3 * pD[2+ps*3];
+	//
+	beta = 0.0;
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] += w3 * pD[2+ps*ii];
+		beta += pD[3+ps*ii] * pD[3+ps*ii];
+		}
+	// fourth column
+col4:
+	if(beta==0.0)
+		{
+		dD[3] = 0.0;
+		tmp = 0.0;
+		return;
+		}
+	alpha = pD[3+ps*3];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[3] = (beta-alpha) / beta;
+	pT[3+ps*3] = - dD[3];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[3+ps*3] = beta;
+	w0 =  pD[0+ps*3];
+	w1 =  pD[1+ps*3];
+	w2 =  pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] *= tmp;
+		w0 += pD[0+ps*ii] * pD[3+ps*ii];
+		w1 += pD[1+ps*ii] * pD[3+ps*ii];
+		w2 += pD[2+ps*ii] * pD[3+ps*ii];
+		}
+	//
+	pT[2+ps*3] = - dD[3] * (w2*pT[2+ps*2]);
+	pT[1+ps*3] = - dD[3] * (w1*pT[1+ps*1] + w2*pT[1+ps*2]);
+	pT[0+ps*3] = - dD[3] * (w0*pT[0+ps*0] + w1*pT[0+ps*1] + w2*pT[0+ps*2]);
+	return;
+	}
+#endif
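+
+// The routine above fuses the panel factorization of kernel_dgelqf_4_lib4
+// with the construction of the 4x4 T factor, avoiding a separate
+// kernel_dlarft_4_lib4 pass; it is compiled out on TARGET_X64_INTEL_HASWELL,
+// where a dedicated implementation is presumably provided elsewhere.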
+
+
+
+void kernel_dlarfb4_r_1_lib4(int kmax, double *pV, double *pT, double *pD)
+	{
+	const int ps = 4;
+	double pW[16];
+	int kk;
+	// 0
+	pW[0+ps*0] = pD[0+ps*0];
+	// 1
+	pW[0+ps*0] += pD[0+ps*1]*pV[0+ps*1];
+	pW[0+ps*1] = pD[0+ps*1];
+	// 2
+	pW[0+ps*0] += pD[0+ps*2]*pV[0+ps*2];
+	pW[0+ps*1] += pD[0+ps*2]*pV[1+ps*2];
+	pW[0+ps*2] = pD[0+ps*2];
+	// 3
+	pW[0+ps*0] += pD[0+ps*3]*pV[0+ps*3];
+	pW[0+ps*1] += pD[0+ps*3]*pV[1+ps*3];
+	pW[0+ps*2] += pD[0+ps*3]*pV[2+ps*3];
+	pW[0+ps*3] = pD[0+ps*3];
+	//
+	for(kk=4; kk<kmax; kk++)
+		{
+		pW[0+ps*0] += pD[0+ps*kk]*pV[0+ps*kk];
+		pW[0+ps*1] += pD[0+ps*kk]*pV[1+ps*kk];
+		pW[0+ps*2] += pD[0+ps*kk]*pV[2+ps*kk];
+		pW[0+ps*3] += pD[0+ps*kk]*pV[3+ps*kk];
+		}
+	//
+	pW[0+ps*3] = pW[0+ps*0]*pT[0+ps*3] + pW[0+ps*1]*pT[1+ps*3] + pW[0+ps*2]*pT[2+ps*3] + pW[0+ps*3]*pT[3+ps*3];
+	//
+	pW[0+ps*2] = pW[0+ps*0]*pT[0+ps*2] + pW[0+ps*1]*pT[1+ps*2] + pW[0+ps*2]*pT[2+ps*2];
+	//
+	pW[0+ps*1] = pW[0+ps*0]*pT[0+ps*1] + pW[0+ps*1]*pT[1+ps*1];
+	//
+	pW[0+ps*0] = pW[0+ps*0]*pT[0+ps*0];
+	//
+	pD[0+ps*0] += pW[0+ps*0];
+	//
+	pD[0+ps*1] += pW[0+ps*0]*pV[0+ps*1] + pW[0+ps*1];
+	//
+	pD[0+ps*2] += pW[0+ps*0]*pV[0+ps*2] + pW[0+ps*1]*pV[1+ps*2] + pW[0+ps*2];
+	//
+	pD[0+ps*3] += pW[0+ps*0]*pV[0+ps*3] + pW[0+ps*1]*pV[1+ps*3] + pW[0+ps*2]*pV[2+ps*3] + pW[0+ps*3];
+	for(kk=4; kk<kmax; kk++)
+		{
+		pD[0+ps*kk] += pW[0+ps*0]*pV[0+ps*kk] + pW[0+ps*1]*pV[1+ps*kk] + pW[0+ps*2]*pV[2+ps*kk] + pW[0+ps*3]*pV[3+ps*kk];
+		}
+	return;
+	}
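+
+// Single-row application of the block reflector built by
+// kernel_dlarft_4_lib4: with d the 1 x kmax row pD and V the 4 x kmax
+// reflector block pV (unit diagonal implicit), the code above computes
+//   w = d * V^T,   w = w * T,   d = d + w * V,
+// i.e. d <- d * (I + V^T * T * V), using the stack buffer pW to hold w.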
+
+
+
+
diff --git a/kernel/avx/kernel_dgetrf_pivot_4_lib4.c b/kernel/avx/kernel_dgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..91d1cc0
--- /dev/null
+++ b/kernel/avx/kernel_dgetrf_pivot_4_lib4.c
@@ -0,0 +1,1434 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
+// C numbering (starting from zero) in the ipiv
+void kernel_dgetrf_pivot_4_lib4(int m, double *pA, int sda, double *inv_diag_A, int* ipiv)
+	{
+
+	const int bs = 4;
+
+	// assume m>=4
+	int ma = m-4;
+
+	__m128d
+		max0, max1, msk0, imx0, imx1,
+		inv;
+	
+		
+	__m256d
+		lft, msk,
+		sgn, vna, max, imx, idx,
+		ones,
+		tmp,
+		a_0,
+		b_0, b_1, b_2,
+		scl,
+		c_0,
+		d_0;
+	
+	double
+		dlft;
+
+	sgn = _mm256_set_pd( -0.0, -0.0, -0.0, -0.0 );
+	vna = _mm256_set_pd( 4.0, 4.0, 4.0, 4.0 );
+	lft  = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+	ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+
+	double
+		tmp0;
+	
+	double
+		*pB;
+	
+	int 
+		k, idamax;
+	
+	int B_pref = bs*sda;
+	
+
+	// first column
+
+	// find pivot
+	pB = &pA[0+bs*0];
+	idx = lft; // _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	k = 0;
+	for( ; k<m-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0] );
+//		__builtin_prefetch( pB+2*B_pref );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0] );
+//		__builtin_prefetch( pB+2*B_pref );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for( ; k<m-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0] );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<m)
+		{
+		dlft = m-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		a_0 = _mm256_load_pd( &pB[0] );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_blendv_pd( a_0, sgn, msk );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
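+	// The running argmax is tracked with indexes encoded as doubles (the
+	// .2 offsets in lft/idx make the final round-to-nearest conversion in
+	// _mm_cvtsd_si32 land on the intended integer, also for the negative
+	// start values used in the later columns); the extract/permute steps
+	// above reduce the four lanes to a single maximum, and the strict >
+	// comparisons keep the lowest index on ties, as idamax does.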
+
+	// compute scaling
+	ipiv[0] = idamax;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[0]!=0)
+			drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+		inv = _mm_loaddup_pd( &pA[0+bs*0] );
+		inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+		scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+		_mm_store_sd( &inv_diag_A[0], inv );
+		}
+	else
+		{
+		scl = ones;
+		inv_diag_A[0] = 0.0;
+		}
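+	// A column of exact zeros gets no row swap and no scaling: scl is left
+	// at 1.0 so the scale step of the next column is a no-op, and
+	// inv_diag_A[0] = 0.0 flags the zero pivot to the caller.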
+
+
+	// second column
+
+	// scale & correct & find pivot
+	idx = _mm256_set_pd( 2.2, 1.2, 0.2, -0.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	c_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	a_0 = _mm256_blend_pd( tmp, a_0, 0x1 );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	d_0 = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( d_0, c_0, 0x1 );
+	_mm256_store_pd( &pA[0+bs*0], a_0 );
+	_mm256_store_pd( &pA[0+bs*1], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x1 );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	ipiv[1] = idamax+1;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[1]!=1)
+			drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+		inv = _mm_loaddup_pd( &pA[1+bs*1] );
+		inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+		scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+		_mm_store_sd( &inv_diag_A[1], inv );
+		}
+	else
+		{
+		scl = ones;
+		inv_diag_A[1] = 0.0;
+		}
+
+
+	// third column
+
+	// scale & correct & find pivot
+	idx = _mm256_set_pd( 1.2, 0.2, -0.8, -1.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	c_0 = _mm256_load_pd( &pA[0+bs*2] );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( tmp, c_0, 0x1 );
+	a_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	a_0 = _mm256_blend_pd( tmp, a_0, 0x3 );
+	b_1 = _mm256_permute_pd( b_1, 0xf );
+	tmp = _mm256_mul_pd( a_0, b_1 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( tmp, c_0, 0x3 );
+	_mm256_store_pd( &pA[0+bs*1], a_0 );
+	_mm256_store_pd( &pA[0+bs*2], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x3 );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	ipiv[2] = idamax+2;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[2]!=2)
+			drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+		inv = _mm_loaddup_pd( &pA[2+bs*2] );
+		inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+		scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+		_mm_store_sd( &inv_diag_A[2], inv );
+		}
+	else
+		{
+		scl = ones;
+		inv_diag_A[2] = 0.0;
+		}
+
+
+	// fourth column
+
+	// scale & correct & find pivot
+	idx = _mm256_set_pd( 0.2, -0.8, -1.8, -2.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	c_0 = _mm256_load_pd( &pA[0+bs*3] );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( tmp, c_0, 0x1 );
+	b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_1 = _mm256_permute_pd( b_1, 0xf );
+	a_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, b_1 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( tmp, c_0, 0x3 );
+	a_0 = _mm256_load_pd( &pA[0+bs*2] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_2 = _mm256_permute2f128_pd( c_0, c_0, 0x11 );
+	a_0 = _mm256_blend_pd( tmp, a_0, 0x7 );
+	b_2 = _mm256_permute_pd( b_2, 0x0 );
+	tmp = _mm256_mul_pd( a_0, b_2 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( tmp, c_0, 0x7 );
+	_mm256_store_pd( &pA[0+bs*2], a_0 );
+	_mm256_store_pd( &pA[0+bs*3], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x7 );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	ipiv[3] = idamax+3;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[3]!=3)
+			drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+		inv = _mm_loaddup_pd( &pA[3+bs*3] );
+		inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+		scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+		_mm_store_sd( &inv_diag_A[3], inv );
+		}
+	else
+		{
+		scl = ones;
+		inv_diag_A[3] = 0.0;
+		}
+
+	// scale
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		tmp = _mm256_mul_pd( c_0, scl );
+		c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+//		pB += B_pref;
+		}
+
+	return;
+
+	}
+
+	
+
+void kernel_dgetrf_pivot_4_vs_lib4(int m, int n, double *pA, int sda, double *inv_diag_A, int* ipiv)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	// assume m>=4
+	int ma = m-4;
+
+	__m128d
+		max0, max1, msk0, imx0, imx1,
+		inv;
+	
+		
+	__m256d
+		lft, msk,
+		sgn, vna, max, imx, idx,
+		ones,
+		tmp,
+		a_0,
+		b_0, b_1, b_2,
+		scl,
+		c_0,
+		d_0;
+	
+	double
+		dlft;
+
+	sgn = _mm256_set_pd( -0.0, -0.0, -0.0, -0.0 );
+	vna = _mm256_set_pd( 4.0, 4.0, 4.0, 4.0 );
+	lft  = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+	ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+
+	double
+		tmp0;
+	
+	double
+		*pB;
+	
+	int 
+		k, idamax;
+	
+	int B_pref = bs*sda;
+	
+
+	// first column
+
+	// find pivot
+	pB = &pA[0+bs*0];
+	idx = lft; // _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	k = 0;
+	for( ; k<m-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0] );
+//		__builtin_prefetch( pB+2*B_pref );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0] );
+//		__builtin_prefetch( pB+2*B_pref );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for( ; k<m-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0] );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<m)
+		{
+		dlft = m-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		a_0 = _mm256_load_pd( &pB[0] );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_blendv_pd( a_0, sgn, msk );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	ipiv[0] = idamax;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[0]!=0)
+			drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+		inv = _mm_loaddup_pd( &pA[0+bs*0] );
+		inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+		scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+		_mm_store_sd( &inv_diag_A[0], inv );
+		}
+	else
+		{
+		scl = ones;
+		inv_diag_A[0] = 0.0;
+		}
+	
+	if(n==1)
+		{
+		// scale & return
+		dlft = m;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_load_pd( &pA[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_blend_pd( tmp, a_0, 0x1 );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		_mm256_store_pd( &pA[0+bs*0], a_0 );
+		pB = pA + B_pref;
+		k = 0;
+		for(; k<ma-7; k+=8)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*0] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*0], a_0 );
+			pB += B_pref;
+			a_0 = _mm256_load_pd( &pB[0+bs*0] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*0], a_0 );
+			pB += B_pref;
+			}
+		for(; k<ma-3; k+=4)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*0] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*0], a_0 );
+			pB += B_pref;
+			}
+		if(k<ma)
+			{
+			dlft = ma-k;
+			msk = _mm256_broadcast_sd( &dlft );
+			msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+			a_0 = _mm256_load_pd( &pB[0+bs*0] );
+			tmp = _mm256_mul_pd( a_0, scl );
+			a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+			_mm256_store_pd( &pB[0+bs*0], a_0 );
+	//		pB += B_pref;
+			}
+
+		return;
+		}
+
+
+	// second column
+
+	// scale & correct & find pivot
+	dlft = m;
+	msk = _mm256_broadcast_sd( &dlft );
+	msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+	idx = _mm256_set_pd( 2.2, 1.2, 0.2, -0.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	c_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	tmp = _mm256_blend_pd( tmp, a_0, 0x1 );
+	a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	d_0 = _mm256_sub_pd( c_0, tmp );
+	d_0 = _mm256_blend_pd( d_0, c_0, 0x1 );
+	c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+	_mm256_store_pd( &pA[0+bs*0], a_0 );
+	_mm256_store_pd( &pA[0+bs*1], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x1 );
+	c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	if(m>1)
+		{
+		ipiv[1] = idamax+1;
+		if(tmp0!=0)
+			{
+			if(ipiv[1]!=1)
+				drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+			inv = _mm_loaddup_pd( &pA[1+bs*1] );
+			inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+			scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+			_mm_store_sd( &inv_diag_A[1], inv );
+			}
+		else
+			{
+			scl = ones;
+			inv_diag_A[1] = 0.0;
+			}
+		}
+
+	if(n==2)
+		{
+		// scale & return
+		dlft = m;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_load_pd( &pA[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_blend_pd( tmp, a_0, 0x3 );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		_mm256_store_pd( &pA[0+bs*1], a_0 );
+		pB = pA + B_pref;
+		k = 0;
+		for(; k<ma-7; k+=8)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*1] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*1], a_0 );
+			pB += B_pref;
+			a_0 = _mm256_load_pd( &pB[0+bs*1] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*1], a_0 );
+			pB += B_pref;
+			}
+		for(; k<ma-3; k+=4)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*1] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*1], a_0 );
+			pB += B_pref;
+			}
+		if(k<ma)
+			{
+			dlft = ma-k;
+			msk = _mm256_broadcast_sd( &dlft );
+			msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+			a_0 = _mm256_load_pd( &pB[0+bs*1] );
+			tmp = _mm256_mul_pd( a_0, scl );
+			a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+			_mm256_store_pd( &pB[0+bs*1], a_0 );
+	//		pB += B_pref;
+			}
+
+		return;
+		}
+
+	// third column
+
+	// scale & correct & find pivot
+	dlft = m;
+	msk = _mm256_broadcast_sd( &dlft );
+	msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+	idx = _mm256_set_pd( 1.2, 0.2, -0.8, -1.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	c_0 = _mm256_load_pd( &pA[0+bs*2] );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	tmp = _mm256_blend_pd( tmp, c_0, 0x1 );
+	c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+	a_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	tmp = _mm256_blend_pd( tmp, a_0, 0x3 );
+	a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+	b_1 = _mm256_permute_pd( b_1, 0xf );
+	tmp = _mm256_mul_pd( a_0, b_1 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	tmp = _mm256_blend_pd( tmp, c_0, 0x3 );
+	c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+	_mm256_store_pd( &pA[0+bs*1], a_0 );
+	_mm256_store_pd( &pA[0+bs*2], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x3 );
+	c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	if(m>2)
+		{
+		ipiv[2] = idamax+2;
+		if(tmp0!=0)
+			{
+			if(ipiv[2]!=2)
+				drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+			inv = _mm_loaddup_pd( &pA[2+bs*2] );
+			inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+			scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+			_mm_store_sd( &inv_diag_A[2], inv );
+			}
+		else
+			{
+			scl = ones;
+			inv_diag_A[2] = 0.0;
+			}
+		}
+
+	if(n==3)
+		{
+		// scale & return
+		dlft = m;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_load_pd( &pA[0+bs*2] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_blend_pd( tmp, a_0, 0x7 );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		_mm256_store_pd( &pA[0+bs*2], a_0 );
+		pB = pA + B_pref;
+		k = 0;
+		for(; k<ma-7; k+=8)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*2] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*2], a_0 );
+			pB += B_pref;
+			a_0 = _mm256_load_pd( &pB[0+bs*2] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*2], a_0 );
+			pB += B_pref;
+			}
+		for(; k<ma-3; k+=4)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*2] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*2], a_0 );
+			pB += B_pref;
+			}
+		if(k<ma)
+			{
+			dlft = ma-k;
+			msk = _mm256_broadcast_sd( &dlft );
+			msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+			a_0 = _mm256_load_pd( &pB[0+bs*2] );
+			tmp = _mm256_mul_pd( a_0, scl );
+			a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+			_mm256_store_pd( &pB[0+bs*2], a_0 );
+	//		pB += B_pref;
+			}
+
+		return;
+		}
+
+	// fourth column
+
+	// scale & correct & find pivot
+	dlft = m;
+	msk = _mm256_broadcast_sd( &dlft );
+	msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+	idx = _mm256_set_pd( 0.2, -0.8, -1.8, -2.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	c_0 = _mm256_load_pd( &pA[0+bs*3] );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	tmp = _mm256_blend_pd( tmp, c_0, 0x1 );
+	c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+	b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_1 = _mm256_permute_pd( b_1, 0xf );
+	a_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, b_1 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	tmp = _mm256_blend_pd( tmp, c_0, 0x3 );
+	c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+	a_0 = _mm256_load_pd( &pA[0+bs*2] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_2 = _mm256_permute2f128_pd( c_0, c_0, 0x11 );
+	tmp = _mm256_blend_pd( tmp, a_0, 0x7 );
+	a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+	b_2 = _mm256_permute_pd( b_2, 0x0 );
+	tmp = _mm256_mul_pd( a_0, b_2 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	tmp = _mm256_blend_pd( tmp, c_0, 0x7 );
+	c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+	_mm256_store_pd( &pA[0+bs*2], a_0 );
+	_mm256_store_pd( &pA[0+bs*3], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x7 );
+	c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	if(m>3)
+		{
+		ipiv[3] = idamax+3;
+		if(tmp0!=0)
+			{
+			if(ipiv[3]!=3)
+				drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+			inv = _mm_loaddup_pd( &pA[3+bs*3] );
+			inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+			scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+			_mm_store_sd( &inv_diag_A[3], inv );
+			}
+		else
+			{
+			scl = ones;
+			inv_diag_A[3] = 0.0;
+			}
+		}
+
+	// scale
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		tmp = _mm256_mul_pd( c_0, scl );
+		c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+//		pB += B_pref;
+		}
+
+	return;
+
+	}
+
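For reference, the pivot search in the kernel above keeps a running per-lane maximum (`max`) together with matching per-lane row indices (`imx`, carried as doubles such as -1.8, -0.8, 0.2, 1.2 in the `_mm256_set_pd` initializers, so that a round-to-nearest conversion yields the intended row offset), and then collapses both to a scalar. The following standalone C sketch is illustrative only and not part of the BLASFEO sources; the helper name `reduce_argmax_d4` and its signature are assumptions. It mirrors the final reduction with the same intrinsics, preferring the lower index when two lanes hold the same maximum, as noted in the comments above.

#include <immintrin.h>

// Reduce a 4-lane running maximum and its per-lane indices to a scalar
// maximum and one integer index (the lower index wins on ties, because
// the comparison is a strict greater-than).
static int reduce_argmax_d4(__m256d max, __m256d imx, double *max_out)
	{
	__m128d max0 = _mm256_extractf128_pd( max, 0x0 );
	__m128d max1 = _mm256_extractf128_pd( max, 0x1 );
	__m128d imx0 = _mm256_extractf128_pd( imx, 0x0 );
	__m128d imx1 = _mm256_extractf128_pd( imx, 0x1 );
	__m128d msk0 = _mm_cmp_pd( max1, max0, 14 ); // strict >, so ties keep the lower lanes
	max0 = _mm_blendv_pd( max0, max1, msk0 );
	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
	max1 = _mm_permute_pd( max0, 0x1 );
	imx1 = _mm_permute_pd( imx0, 0x1 );
	msk0 = _mm_cmp_pd( max1, max0, 14 );
	max0 = _mm_blendv_pd( max0, max1, msk0 );
	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
	_mm_store_sd( max_out, max0 );
	// convert the fractional index (e.g. 3.2) to int; round-to-nearest
	// under the default MXCSR rounding mode gives the integer part
	return _mm_cvtsd_si32( imx0 );
	}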
diff --git a/kernel/avx/kernel_dsymv_6_lib4.S b/kernel/avx/kernel_dsymv_6_lib4.S
new file mode 100644
index 0000000..b55690a
--- /dev/null
+++ b/kernel/avx/kernel_dsymv_6_lib4.S
@@ -0,0 +1,1031 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t
+// r14   <- z_n
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm4  <- [z_t_4a z_t_4b z_t_4c z_t_4d]
+// ymm5  <- [z_t_5a z_t_5b z_t_5c z_t_5d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- x_n_4
+// ymm11 <- x_n_5
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t+k*sizeof(double)
+// r14   <- z_n+k*sizeof(double)
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm4  <- [z_t_4a z_t_4b z_t_4c z_t_4d]
+// ymm5  <- [z_t_5a z_t_5b z_t_5c z_t_5d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- x_n_4
+// ymm11 <- x_n_5
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_NT_6_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_nt_6_lib4, @function
+inner_kernel_dgemv_add_nt_6_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_nt_6_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_nt_6_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_nt_6_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovupd	0(%r13), %ymm12
+	vmovupd	0(%r14), %ymm13
+
+	vmovapd	0(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm14, %ymm6, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	subl	$4, %r10d
+
+	vmovapd	32(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmulpd	%ymm14, %ymm7, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovapd	64(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm14, %ymm8, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+
+	vmovapd	96(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vmulpd	%ymm14, %ymm9, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovapd	128(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	vmulpd	%ymm14, %ymm10, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovapd	160(%r11), %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	vmulpd	%ymm14, %ymm11, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovupd	%ymm13, 0(%r14) 
+
+	addq	%r12, %r11
+	addq	$32, %r13
+	addq	$32, %r14
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vcvtsi2sd	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vsubpd		%ymm14, %ymm13, %ymm14
+
+	vmaskmovpd	0(%r13), %ymm14, %ymm12
+	vmaskmovpd	0(%r14), %ymm14, %ymm13
+
+	vmovupd	%ymm14, -32(%rsp) // spill mask to stack
+
+//	vmaskmovpd	-32(%rsp), %ymm14
+	vmaskmovpd	0(%r11), %ymm14, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm14, %ymm6, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	32(%r11), %ymm14, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmulpd	%ymm14, %ymm7, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	64(%r11), %ymm14, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	vmulpd	%ymm14, %ymm8, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	96(%r11), %ymm14, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vmulpd	%ymm14, %ymm9, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+		
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	128(%r11), %ymm14, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm4, %ymm15, %ymm4
+	vmulpd	%ymm14, %ymm10, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	160(%r11), %ymm14, %ymm14
+	vmulpd	%ymm14, %ymm12, %ymm15
+	vaddpd	%ymm5, %ymm15, %ymm5
+	vmulpd	%ymm14, %ymm11, %ymm15
+	vaddpd	%ymm13, %ymm15, %ymm13
+	
+	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovpd	%ymm13, %ymm14, 0(%r14)
+
+	sall	$3, %r10d
+	addq	%r10, %r11
+	addq	%r10, %r13
+	addq	%r10, %r14
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_nt_6_lib4, .-inner_kernel_dgemv_add_nt_6_lib4
+#endif
+#endif
+
+
+
+
+
+
+#if 0
+
+// TODO
+// common inner routine with file scope
+//
+// input arguments:
+// r10   <- kmax
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t
+// r14   <- z_n
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- kmax-4
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t+k*sizeof(double)
+// r14   <- z_n+k*sizeof(double)
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dsymv_add_nt_4_lib4, @function
+inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dsymv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_lib4:
+#endif
+#endif
+
+	vmovupd		0(%r13), %ymm12
+	vmovupd		0(%r14), %ymm13
+
+	vmovapd		0(%r11), %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm6, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmovapd		32(%r11), %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x3, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm7, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmovapd		64(%r11), %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x3, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x7, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm8, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+
+	vmovapd		96(%r11), %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x7, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+//	vxorpd		%ymm15, %ymm15, %ymm15
+//	vblendpd	$0x0, %ymm14, %ymm15, %ymm14
+//	vmulpd		%ymm14, %ymm9, %ymm15
+//	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmovupd		%ymm13, 0(%r14) 
+
+	addq	%r12, %r11
+	addq	$32, %r13
+	addq	$32, %r14
+	
+	subq	$4, %r10
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dsymv_add_nt_4_lib4, .-inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_AB_6_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_ab_6_lib4, @function
+inner_blend_t_scale_ab_6_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_6_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_ab_6_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_6_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd			%ymm1, %ymm0, %ymm0
+	vhaddpd			%ymm3, %ymm2, %ymm2
+	vhaddpd			%ymm5, %ymm4, %ymm4
+//	vhaddpd			%ymm3, %ymm2, %ymm2
+	vperm2f128		$0x2, %ymm0, %ymm2, %ymm1
+	vperm2f128		$0x13, %ymm0, %ymm2, %ymm0
+	vextractf128	$0x1, %ymm4, %xmm5
+	vaddpd			%ymm0, %ymm1, %ymm0
+	vaddpd			%ymm4, %ymm5, %ymm4
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm0
+	vmulpd			%ymm4, %ymm15, %ymm1
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd			0(%r12), %ymm14
+	vmovupd			32(%r12), %ymm13
+	vmulpd			%ymm15, %ymm14, %ymm14
+	vaddpd			%ymm0, %ymm14, %ymm0
+	vmulpd			%ymm15, %ymm13, %ymm13
+	vaddpd			%ymm1, %ymm13, %ymm1
+	
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vblendpd		$0x3, %ymm1, %ymm15, %ymm1
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_ab_6_lib4, .-inner_blend_t_scale_ab_6_lib4
+#endif
+#endif
+
+
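The reduction at the top of inner_blend_t_scale_ab_6_lib4 above turns four column accumulators into a single vector of dot products via vhaddpd plus a cross-lane shuffle (the fifth and sixth accumulators are reduced separately in the assembly). A minimal C sketch of that pattern follows; it is illustrative only, and the helper name hsum4_pd is an assumption, not BLASFEO API.

#include <immintrin.h>

// Horizontally sum four 4-lane accumulators: lane i of the result is the
// full sum of y_i, mirroring the vhaddpd/vperm2f128/vaddpd sequence above.
static __m256d hsum4_pd(__m256d y0, __m256d y1, __m256d y2, __m256d y3)
	{
	__m256d t01 = _mm256_hadd_pd( y0, y1 ); // { y0[0]+y0[1], y1[0]+y1[1], y0[2]+y0[3], y1[2]+y1[3] }
	__m256d t23 = _mm256_hadd_pd( y2, y3 );
	__m256d lo  = _mm256_permute2f128_pd( t23, t01, 0x02 ); // partial sums of lanes 0..1 of y0..y3
	__m256d hi  = _mm256_permute2f128_pd( t23, t01, 0x13 ); // partial sums of lanes 2..3 of y0..y3
	return _mm256_add_pd( lo, hi );
	}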
+
+
+
+#if 0
+
+//TODO
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_a1_4_lib4, @function
+inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_a1_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm1
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vaddpd	%ymm0, %ymm1, %ymm0
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+
+	// beta
+	vmovupd		0(%r11), %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_a1_4_lib4, .-inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+#endif
+
+
+
+
+// common inner routine with file scope
+//
+// store 
+//
+// input arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_6_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_6_lib4, @function
+inner_store_6_lib4:
+#elif defined(OS_MAC)
+_inner_store_6_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_6_lib4; .scl 2; .type 32; .endef
+inner_store_6_lib4:
+#endif
+#endif
+	
+	vmovupd %ymm0, 0(%r10)
+	vmovupd %xmm1, 32(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_6_lib4, .-inner_store_6_lib4
+#endif
+#endif
+
+
+
+
+
+//                             rdi    rsi              rdx              rcx        r8       r9           rsp+8        rsp+16          rsp+24       rsp+32       rsp+40
+// void kernel_dgemv_nt_6_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_nt_6_lib4
+	.type kernel_dgemv_nt_6_lib4, @function
+kernel_dgemv_nt_6_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_nt_6_lib4
+_kernel_dgemv_nt_6_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_nt_6_lib4
+	.def kernel_dgemv_nt_6_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_6_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha_n
+	vbroadcastsd 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+
+	vbroadcastsd 0(%r10), %ymm6
+	vmulpd		%ymm15, %ymm6, %ymm6
+	vbroadcastsd 8(%r10), %ymm7
+	vmulpd		%ymm15, %ymm7, %ymm7
+	vbroadcastsd 16(%r10), %ymm8
+	vmulpd		%ymm15, %ymm8, %ymm8
+	vbroadcastsd 24(%r10), %ymm9
+	vmulpd		%ymm15, %ymm9, %ymm9
+	vbroadcastsd 32(%r10), %ymm10
+	vmulpd		%ymm15, %ymm10, %ymm10
+	vbroadcastsd 40(%r10), %ymm11
+	vmulpd		%ymm15, %ymm11, %ymm11
+
+
+	// inner kernel dgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG7, %r13  // x_t
+	movq	ARG10, %r14  // z_n
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_NT_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_nt_6_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_nt_6_lib4
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+	movq	ARG3, %r10 // alpha_t
+	movq	ARG8, %r11   // beta_t
+	movq	ARG9, %r12   // y_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_6_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_6_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG11, %r10 // z_t 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_6_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_6_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_nt_6_lib4, .-kernel_dgemv_nt_6_lib4
+#endif
+
+
+
+
+
+#if 0
+// TODO
+//                            rdi    rsi            rdx        rcx      r8           r9           rsp+8        rsp+16 
+// void kernel_dsymv_l_4_lib4(int k, double *alpha, double *A, int sda, double *x_n, double *x_t, double *z_n, double *z_t);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsymv_l_4_lib4
+	.type kernel_dsymv_l_4_lib4, @function
+kernel_dsymv_l_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsymv_l_4_lib4
+_kernel_dsymv_l_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsymv_l_4_lib4
+	.def kernel_dsymv_l_4_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha
+	vbroadcastsd 0(%r10), %ymm15
+
+	movq	ARG5, %r10 // x_n
+
+	vbroadcastsd 0(%r10), %ymm6
+	vmulpd		%ymm15, %ymm6, %ymm6
+	vbroadcastsd 8(%r10), %ymm7
+	vmulpd		%ymm15, %ymm7, %ymm7
+	vbroadcastsd 16(%r10), %ymm8
+	vmulpd		%ymm15, %ymm8, %ymm8
+	vbroadcastsd 24(%r10), %ymm9
+	vmulpd		%ymm15, %ymm9, %ymm9
+
+
+	// inner edge dsyrk & kernel dgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG6, %r13  // x_t
+	movq	ARG7, %r14  // z_n
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dsymv_add_nt_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11   // z_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z_t 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsymv_l_4_lib4, .-kernel_dsymv_l_4_lib4
+#endif
+
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
+
+
+
+
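A note on the clean-up code in the file above: .LC02 holds { 0.5, 1.5, 2.5, 3.5 } in increasing memory order, and subtracting the broadcast remainder k from it produces a vector whose sign bits enable exactly the first k lanes of vmaskmovpd. The C sketch below reproduces that mask construction; the function names tail_mask_d4 and load_tail_d4 are assumptions for illustration, not BLASFEO API.

#include <immintrin.h>

// Build a lane mask that keeps only the first k of 4 double lanes
// (0 <= k <= 4): lane i is enabled when i + 0.5 - k is negative,
// i.e. when its sign bit is set, which is what vmaskmovpd tests.
static __m256d tail_mask_d4(int k)
	{
	const __m256d lanes = _mm256_set_pd( 3.5, 2.5, 1.5, 0.5 ); // same values as .LC02
	return _mm256_sub_pd( lanes, _mm256_set1_pd( (double) k ) );
	}

// Example: load only the first k doubles of x (the disabled lanes read as 0.0).
static __m256d load_tail_d4(const double *x, int k)
	{
	return _mm256_maskload_pd( x, _mm256_castpd_si256( tail_mask_d4( k ) ) );
	}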
diff --git a/kernel/avx/kernel_sgead_lib8.S b/kernel/avx/kernel_sgead_lib8.S
new file mode 100644
index 0000000..4cafa0a
--- /dev/null
+++ b/kernel/avx/kernel_sgead_lib8.S
@@ -0,0 +1,3096 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r12    <- A
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_0_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_0_lib8, @function
+inner_kernel_sgead_8_0_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_0_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_0_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_0_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%r13), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%r13), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r12
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		64(%r13), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%r13), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%r13), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_0_lib8, .-inner_kernel_sgead_8_0_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r12    <- A
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_0_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_0_gen_lib8, @function
+inner_kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_0_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_0_gen_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovups		0(%r12), %ymm0
+	vmaskmovps	0(%r13), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$4, %r10d
+
+	vmovups		32(%r12), %ymm0
+	vmaskmovps	32(%r13), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r13)
+	addq		$128, %r12
+
+	vmovups		-64(%r12), %ymm0
+	vmaskmovps	64(%r13), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r13)
+	addq		$128, %r13
+
+	vmovups		-32(%r12), %ymm0
+	vmaskmovps	-32(%r13), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovups		0(%r12), %ymm0
+	vmaskmovps	0(%r13), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_0_gen_lib8, .-inner_kernel_sgead_8_0_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_1_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_1_lib8, @function
+inner_kernel_sgead_8_1_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_1_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_1_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_1_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+#if 1
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		64(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		-32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r14)
+#else
+	vmovups		4(%r12), %ymm0
+	vmovups		-28(%rax), %ymm1
+	vblendps	$0x80, %ymm1, %ymm0, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$4, %r10d
+
+	vmovups		36(%r12), %ymm0
+	vmovups		4(%rax), %ymm1
+	vblendps	$0x80, %ymm1, %ymm0, %ymm0
+	vmovaps		32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovups		-60(%r12), %ymm0
+	vmovups		-92(%rax), %ymm1
+	vblendps	$0x80, %ymm1, %ymm0, %ymm0
+	vmovaps		64(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r14)
+	addq		$128, %r14
+
+	vmovups		-28(%r12), %ymm0
+	vmovups		-60(%rax), %ymm1
+	vblendps	$0x80, %ymm1, %ymm0, %ymm0
+	vmovaps		-32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r14)
+#endif
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_1_lib8, .-inner_kernel_sgead_8_1_lib8
+#endif
+#endif
+
+
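The blend/vpermilps/vperm2f128/vblendps sequence in inner_kernel_sgead_8_1_lib8 above gathers one column of eight floats that starts at row offset 1 inside the source panel: it shifts the current panel's column up by one row and pulls in the first element of the next panel. The equivalent C sketch below is illustrative only; the function name shift_up_1_lib8 is an assumption.

#include <immintrin.h>

// Given one column of the current 8-row panel (a0 = rows 0..7) and of the
// next panel (a1 = rows 8..15), return rows 1..8, i.e. the same shuffle
// performed by the blend/vpermilps/vperm2f128/vblendps sequence above.
static __m256 shift_up_1_lib8(__m256 a0, __m256 a1)
	{
	__m256 t = _mm256_blend_ps( a0, a1, 0x01 );      // { a1[0], a0[1..7] }
	t = _mm256_permute_ps( t, 0x39 );                 // rotate each 128-bit half left by one float
	__m256 s = _mm256_permute2f128_ps( t, t, 0x01 );  // swap the two 128-bit halves
	return _mm256_blend_ps( s, t, 0x77 );             // { a0[1..7], a1[0] }
	}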
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+// r15d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_1_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_1_gen_lib8, @function
+inner_kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_1_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_1_gen_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	// compute mask for rows
+	vcvtsi2ss	%r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	64(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	-32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_1_gen_lib8, .-inner_kernel_sgead_8_1_gen_lib8
+#endif
+#endif
+
+
+
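+/*
+ * Reference only (not part of the build): the *_gen variants differ from the plain
+ * kernels only in how B is touched.  r15d (m1) is turned into a per-row mask: the
+ * broadcast value (float)m1 is subtracted from .LC00, whose entries from element 0
+ * upward are 0.5, 1.5, ..., 7.5, so the lanes with i + 0.5 < m1 become negative,
+ * i.e. get their sign bit set, and vmaskmovps then loads/stores only those rows of
+ * each B column.  A C sketch of the effective mask (the name compute_row_mask_ref
+ * is illustrative only):
+ *
+ * #include <stdint.h>
+ *
+ * static void compute_row_mask_ref(int m1, uint32_t mask[8])
+ *     {
+ *     for(int i=0; i<8; i++)
+ *         {
+ *         // sign bit set  <=>  row i is one of the first m1 rows
+ *         mask[i] = ((0.5f + (float) i) - (float) m1 < 0.0f) ? 0x80000000u : 0u;
+ *         }
+ *     }
+ */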
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_2_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_2_lib8, @function
+inner_kernel_sgead_8_2_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_2_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_2_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_2_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		64(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		-32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_2_lib8, .-inner_kernel_sgead_8_2_lib8
+#endif
+#endif
+
+
+
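+/*
+ * Kernels 8_2 .. 8_7 all follow the 8_1 pattern with different immediates: the
+ * first vblendps mask is (1<<offset)-1 (0x03, 0x07, 0x1f, 0x3f, 0x7f), vpermilps
+ * rotates each 128-bit lane by offset mod 4 (0x39 for 1 and 5, 0x4e for 2 and 6,
+ * 0x93 for 3 and 7), and the final vblendps selects between the rotated and the
+ * lane-swapped copy.  They all reduce to the reference sketch sgead_8_shift_ref()
+ * given earlier with the corresponding offset value.
+ */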
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+// r15d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_2_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_2_gen_lib8, @function
+inner_kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_2_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_2_gen_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	// compute mask for rows
+	vcvtsi2ss	%r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	64(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	-32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_2_gen_lib8, .-inner_kernel_sgead_8_2_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_3_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_3_lib8, @function
+inner_kernel_sgead_8_3_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_3_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_3_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_3_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		64(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		-32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_3_lib8, .-inner_kernel_sgead_8_3_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+// r15d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_3_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_3_gen_lib8, @function
+inner_kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_3_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_3_gen_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	// compute mask for rows
+	vcvtsi2ss	%r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	64(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	-32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_3_gen_lib8, .-inner_kernel_sgead_8_3_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_4_lib8, @function
+inner_kernel_sgead_8_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_4_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		16(%r12), %xmm0
+	vmovaps		0(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		48(%r12), %xmm0
+	vmovaps		32(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r14)
+	addq		$128, %r12
+
+	vmovaps		-48(%r12), %xmm0
+	vmovaps		64(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		64(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r14)
+	addq		$128, %rax
+
+	vmovaps		-16(%r12), %xmm0
+	vmovaps		-32(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		96(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 96(%r14)
+	addq		$128, %r14
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		16(%r12), %xmm0
+	vmovaps		0(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_4_lib8, .-inner_kernel_sgead_8_4_lib8
+#endif
+#endif
+
+
+
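+/*
+ * Offset 4 is the degenerate case of the shift above: the wanted column is rows
+ * 4..7 of the A0 column (loaded as an xmm from +16 bytes) with rows 0..3 of the A1
+ * column inserted as the upper 128-bit half, so a single vinsertf128 replaces the
+ * permute/blend sequence.  In terms of the reference sketch given earlier this is
+ * sgead_8_shift_ref(..., offset=4, ...).
+ */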
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+// r15d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_4_gen_lib8, @function
+inner_kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_4_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_4_gen_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	// compute mask for rows
+	vcvtsi2ss	%r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		16(%r12), %xmm0
+	vmovaps		0(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		48(%r12), %xmm0
+	vmovaps		32(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r14)
+	addq		$128, %r12
+
+	vmovaps		-48(%r12), %xmm0
+	vmovaps		64(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	64(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r14)
+	addq		$128, %rax
+
+	vmovaps		-16(%r12), %xmm0
+	vmovaps		-32(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	96(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 96(%r14)
+	addq		$128, %r14
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		16(%r12), %xmm0
+	vmovaps		0(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_4_gen_lib8, .-inner_kernel_sgead_8_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_5_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_5_lib8, @function
+inner_kernel_sgead_8_5_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_5_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_5_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_5_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		64(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		-32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_5_lib8, .-inner_kernel_sgead_8_5_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+// r15d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_5_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_5_gen_lib8, @function
+inner_kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_5_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_5_gen_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	// compute mask for rows
+	vcvtsi2ss	%r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	64(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	-32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_5_gen_lib8, .-inner_kernel_sgead_8_5_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_6_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_6_lib8, @function
+inner_kernel_sgead_8_6_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_6_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_6_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_6_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		64(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		-32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_6_lib8, .-inner_kernel_sgead_8_6_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+// r15d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_6_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_6_gen_lib8, @function
+inner_kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_6_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_6_gen_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	// compute mask for rows
+	vcvtsi2ss	%r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	64(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	-32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_6_gen_lib8, .-inner_kernel_sgead_8_6_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_7_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_7_lib8, @function
+inner_kernel_sgead_8_7_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_7_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_7_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_7_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		64(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		-32(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		0(%r14), %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_7_lib8, .-inner_kernel_sgead_8_7_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- 8*sda*sizeof(float)
+// r14    <- B
+// r15d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGEAD_8_7_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgead_8_7_gen_lib8, @function
+inner_kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgead_8_7_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_7_gen_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm14
+
+	// compute mask for rows
+	vcvtsi2ss	%r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r14)
+	addq		$128, %r12
+	addq		$128, %rax
+
+	vmovaps		-64(%r12), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	64(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r14)
+	addq		$128, %r14
+
+	vmovaps		-32(%r12), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	-32(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r14)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	0(%r14), %ymm15, %ymm13
+	vmulps		%ymm14, %ymm0, %ymm0
+	vaddps		%ymm13, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r14)
+	subl		$1, %r10d
+	addq		$32, %r12
+	addq		$32, %rax
+	addq		$32, %r14
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgead_8_7_gen_lib8, .-inner_kernel_sgead_8_7_gen_lib8
+#endif
+#endif
+
+
+
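+/*
+ * The exported kernel_sgead_8_*_lib8 functions below are thin wrappers: they move
+ * the C arguments into the registers documented in the headers of the inner
+ * subroutines above (r10d <- k, r11 <- alpha, r12 <- A, and so on) and then call,
+ * or expand as a macro, the matching inner kernel.  A C sketch of a caller that
+ * dispatches on the row offset is given below for reference only; the dispatcher
+ * itself is illustrative and not BLASFEO API, while the two prototypes follow the
+ * argument order used by the wrappers in this file.
+ *
+ * void kernel_sgead_8_0_lib8(int k, float *alpha, float *A, float *B);
+ * void kernel_sgead_8_4_lib8(int k, float *alpha, float *A, int sda, float *B);
+ *
+ * // add alpha*A to one 8-row strip of B, with A starting offs rows into its panel
+ * static void sgead_8_strip_example(int k, float alpha, int offs,
+ *                                   float *A, int sda, float *B)
+ *     {
+ *     if(offs==0)
+ *         kernel_sgead_8_0_lib8(k, &alpha, A, B);
+ *     else if(offs==4)
+ *         kernel_sgead_8_4_lib8(k, &alpha, A, sda, B);
+ *     // offsets 1, 2, 3, 5, 6, 7 call the corresponding kernel_sgead_8_<offs>_lib8
+ *     }
+ */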
+
+
+//                            1      2             3         4
+// void kernel_sgead_8_0_lib8(int k, float *alpha, float *A, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_0_lib8
+	.type kernel_sgead_8_0_lib8, @function
+kernel_sgead_8_0_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_0_lib8
+_kernel_sgead_8_0_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_0_lib8
+	.def kernel_sgead_8_0_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_0_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_0_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_0_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_0_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_0_lib8, .-kernel_sgead_8_0_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4         5
+// void kernel_sgead_8_0_gen_lib8(int k, float *alpha, float *A, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_0_gen_lib8
+	.type kernel_sgead_8_0_gen_lib8, @function
+kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_0_gen_lib8
+_kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_0_gen_lib8
+	.def kernel_sgead_8_0_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_0_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_0_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_0_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_0_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_0_gen_lib8, .-kernel_sgead_8_0_gen_lib8
+#endif
+
+
+
+
+
+//                            1      2             3         4        5
+// void kernel_sgead_8_1_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_1_lib8
+	.type kernel_sgead_8_1_lib8, @function
+kernel_sgead_8_1_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_1_lib8
+_kernel_sgead_8_1_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_1_lib8
+	.def kernel_sgead_8_1_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_1_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_1_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_1_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_1_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_1_lib8, .-kernel_sgead_8_1_lib8
+#endif
+
+
+
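+/*
+ * The "sall $5, %r13d" in the wrappers above and below converts the column stride
+ * sda into the byte distance between two consecutive 8-row panels of A:
+ * sda * 8 * sizeof(float) == sda << 5.  Equivalent C, for reference only:
+ *
+ *     int panel_stride_bytes = sda * 8 * (int) sizeof(float);   // == sda << 5
+ */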
+
+
+//                                1      2             3         4        5         6
+// void kernel_sgead_8_1_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_1_gen_lib8
+	.type kernel_sgead_8_1_gen_lib8, @function
+kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_1_gen_lib8
+_kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_1_gen_lib8
+	.def kernel_sgead_8_1_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_1_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+	movq	ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_1_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_1_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_1_gen_lib8, .-kernel_sgead_8_1_gen_lib8
+#endif
+
+
+
+
+
+//                            1      2             3         4        5
+// void kernel_sgead_8_2_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_2_lib8
+	.type kernel_sgead_8_2_lib8, @function
+kernel_sgead_8_2_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_2_lib8
+_kernel_sgead_8_2_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_2_lib8
+	.def kernel_sgead_8_2_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_2_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_2_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_2_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_2_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_2_lib8, .-kernel_sgead_8_2_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4        5         6
+// void kernel_sgead_8_2_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_2_gen_lib8
+	.type kernel_sgead_8_2_gen_lib8, @function
+kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_2_gen_lib8
+_kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_2_gen_lib8
+	.def kernel_sgead_8_2_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_2_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+	movq	ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_2_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_2_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_2_gen_lib8, .-kernel_sgead_8_2_gen_lib8
+#endif
+
+
+
+
+
+//                            1      2             3         4        5
+// void kernel_sgead_8_3_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_3_lib8
+	.type kernel_sgead_8_3_lib8, @function
+kernel_sgead_8_3_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_3_lib8
+_kernel_sgead_8_3_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_3_lib8
+	.def kernel_sgead_8_3_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_3_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_3_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_3_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_3_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_3_lib8, .-kernel_sgead_8_3_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4        5         6
+// void kernel_sgead_8_3_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_3_gen_lib8
+	.type kernel_sgead_8_3_gen_lib8, @function
+kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_3_gen_lib8
+_kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_3_gen_lib8
+	.def kernel_sgead_8_3_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_3_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+	movq	ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_3_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_3_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_3_gen_lib8, .-kernel_sgead_8_3_gen_lib8
+#endif
+
+
+
+
+
+//                            1      2             3         4        5
+// void kernel_sgead_8_4_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_4_lib8
+	.type kernel_sgead_8_4_lib8, @function
+kernel_sgead_8_4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_4_lib8
+_kernel_sgead_8_4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_4_lib8
+	.def kernel_sgead_8_4_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_4_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_4_lib8, .-kernel_sgead_8_4_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4        5         6
+// void kernel_sgead_8_4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_4_gen_lib8
+	.type kernel_sgead_8_4_gen_lib8, @function
+kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_4_gen_lib8
+_kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_4_gen_lib8
+	.def kernel_sgead_8_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+	movq	ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_4_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_4_gen_lib8, .-kernel_sgead_8_4_gen_lib8
+#endif
+
+
+
+
+
+//                            1      2             3         4        5
+// void kernel_sgead_8_5_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_5_lib8
+	.type kernel_sgead_8_5_lib8, @function
+kernel_sgead_8_5_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_5_lib8
+_kernel_sgead_8_5_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_5_lib8
+	.def kernel_sgead_8_5_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_5_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_5_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_5_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_5_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_5_lib8, .-kernel_sgead_8_5_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4        5         6
+// void kernel_sgead_8_5_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_5_gen_lib8
+	.type kernel_sgead_8_5_gen_lib8, @function
+kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_5_gen_lib8
+_kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_5_gen_lib8
+	.def kernel_sgead_8_5_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_5_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+	movq	ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_5_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_5_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_5_gen_lib8, .-kernel_sgead_8_5_gen_lib8
+#endif
+
+
+
+
+
+//                            1      2             3         4        5
+// void kernel_sgead_8_6_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_6_lib8
+	.type kernel_sgead_8_6_lib8, @function
+kernel_sgead_8_6_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_6_lib8
+_kernel_sgead_8_6_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_6_lib8
+	.def kernel_sgead_8_6_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_6_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_6_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_6_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_6_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_6_lib8, .-kernel_sgead_8_6_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4        5         6
+// void kernel_sgead_8_6_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_6_gen_lib8
+	.type kernel_sgead_8_6_gen_lib8, @function
+kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_6_gen_lib8
+_kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_6_gen_lib8
+	.def kernel_sgead_8_6_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_6_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+	movq	ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_6_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_6_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_6_gen_lib8, .-kernel_sgead_8_6_gen_lib8
+#endif
+
+
+
+
+
+//                            1      2             3         4        5
+// void kernel_sgead_8_7_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_7_lib8
+	.type kernel_sgead_8_7_lib8, @function
+kernel_sgead_8_7_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_7_lib8
+_kernel_sgead_8_7_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_7_lib8
+	.def kernel_sgead_8_7_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_7_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_7_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_7_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_7_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_7_lib8, .-kernel_sgead_8_7_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4        5         6
+// void kernel_sgead_8_7_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgead_8_7_gen_lib8
+	.type kernel_sgead_8_7_gen_lib8, @function
+kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgead_8_7_gen_lib8
+_kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgead_8_7_gen_lib8
+	.def kernel_sgead_8_7_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_7_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgead kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13  // 8*sda*sizeof(float)
+	sall	$5, %r13d
+	movq	ARG5, %r14  // B
+	movq	ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGEAD_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgead_8_7_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgead_8_7_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgead_8_7_gen_lib8, .-kernel_sgead_8_7_gen_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
+
+
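+/*
+ * The .long values above are IEEE-754 single-precision bit patterns: in memory
+ * order .LC00 holds { 0.5, 1.5, ..., 7.5 } (the row indices biased by 0.5 that the
+ * mask computation subtracts m1 from), .LC01 holds { 8.5, ..., 15.5 }, .LC02 holds
+ * { 16.5, ..., 23.5 }, and .LC03 holds six 1.0 entries followed by two -1.0
+ * entries.  A small C check, for reference only (not part of the build):
+ *
+ * #include <stdio.h>
+ * #include <string.h>
+ * #include <stdint.h>
+ *
+ * int main(void)
+ *     {
+ *     uint32_t lc00[8] = {1056964608u, 1069547520u, 1075838976u, 1080033280u,
+ *                         1083179008u, 1085276160u, 1087373312u, 1089470464u};
+ *     for(int i=0; i<8; i++)
+ *         {
+ *         float f;
+ *         memcpy(&f, &lc00[i], sizeof(f)); // reinterpret the bits as a float
+ *         printf("%f\n", f);               // prints 0.500000, 1.500000, ..., 7.500000
+ *         }
+ *     return 0;
+ *     }
+ */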
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgecp_lib8.S b/kernel/avx/kernel_sgecp_lib8.S
new file mode 100644
index 0000000..5cd2c00
--- /dev/null
+++ b/kernel/avx/kernel_sgecp_lib8.S
@@ -0,0 +1,2796 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_0_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_0_lib8, @function
+inner_kernel_sgecp_8_0_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_0_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_0_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_0_lib8:
+#endif
+#endif
+	
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		%ymm0, 0(%r12)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		%ymm0, 32(%r12)
+	addq		$128, %r11
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		%ymm0, 64(%r12)
+	addq		$128, %r12
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		%ymm0, -32(%r12)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		%ymm0, 0(%r12)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %r12
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_0_lib8, .-inner_kernel_sgecp_8_0_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- B
+// r13d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_0_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_0_gen_lib8, @function
+inner_kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_0_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_0_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
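+	// note (sketch): ymm15 now holds { 0.5-m1, 1.5-m1, ..., 7.5-m1 }, so lane ii
+	// has its sign bit set exactly when ii < m1; vmaskmovps uses that sign bit as
+	// the per-row store mask, i.e. only rows ii < m1 of each column are written.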
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovups		0(%r11), %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r12)
+	subl		$4, %r10d
+
+	vmovups		32(%r11), %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r12)
+	addq		$128, %r11
+
+	vmovups		-64(%r11), %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r12)
+	addq		$128, %r12
+
+	vmovups		-32(%r11), %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r12)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovups		0(%r11), %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r12)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %r12
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_0_gen_lib8, .-inner_kernel_sgecp_8_0_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_1_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_1_lib8, @function
+inner_kernel_sgecp_8_1_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_1_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_1_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_1_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+#if 1
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, -32(%r13)
+#else
+	vmovups		4(%r11), %ymm0
+	vmovups		-28(%rax), %ymm1
+	vblendps	$0x80, %ymm1, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovups		36(%r11), %ymm0
+	vmovups		4(%rax), %ymm1
+	vblendps	$0x80, %ymm1, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovups		-60(%r11), %ymm0
+	vmovups		-92(%rax), %ymm1
+	vblendps	$0x80, %ymm1, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %r13
+
+	vmovups		-28(%r11), %ymm0
+	vmovups		-60(%rax), %ymm1
+	vblendps	$0x80, %ymm1, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r13)
+#endif
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_1_lib8, .-inner_kernel_sgecp_8_1_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_1_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_1_gen_lib8, @function
+inner_kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_1_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_1_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x01, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x77, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_1_gen_lib8, .-inner_kernel_sgecp_8_1_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_2_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_2_lib8, @function
+inner_kernel_sgecp_8_2_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_2_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_2_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_2_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_2_lib8, .-inner_kernel_sgecp_8_2_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_2_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_2_gen_lib8, @function
+inner_kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_2_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_2_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x03, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x33, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_2_gen_lib8, .-inner_kernel_sgecp_8_2_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_3_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_3_lib8, @function
+inner_kernel_sgecp_8_3_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_3_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_3_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_3_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_3_lib8, .-inner_kernel_sgecp_8_3_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_3_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_3_gen_lib8, @function
+inner_kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_3_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_3_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x11, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_3_gen_lib8, .-inner_kernel_sgecp_8_3_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_4_lib8, @function
+inner_kernel_sgecp_8_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_4_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		16(%r11), %xmm0
+	vmovaps		0(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		48(%r11), %xmm0
+	vmovaps		32(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r11
+
+	vmovaps		-48(%r11), %xmm0
+	vmovaps		64(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %rax
+
+	vmovaps		-16(%r11), %xmm0
+	vmovaps		-32(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		%ymm0, 96(%r13)
+	addq		$128, %r13
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		16(%r11), %xmm0
+	vmovaps		0(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_4_lib8, .-inner_kernel_sgecp_8_4_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_4_gen_lib8, @function
+inner_kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_4_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		16(%r11), %xmm0
+	vmovaps		0(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		48(%r11), %xmm0
+	vmovaps		32(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r13)
+	addq		$128, %r11
+
+	vmovaps		-48(%r11), %xmm0
+	vmovaps		64(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r13)
+	addq		$128, %rax
+
+	vmovaps		-16(%r11), %xmm0
+	vmovaps		-32(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 96(%r13)
+	addq		$128, %r13
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		16(%r11), %xmm0
+	vmovaps		0(%rax), %xmm1
+	vinsertf128	$0x01, %xmm1, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_4_gen_lib8, .-inner_kernel_sgecp_8_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_5_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_5_lib8, @function
+inner_kernel_sgecp_8_5_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_5_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_5_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_5_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_5_lib8, .-inner_kernel_sgecp_8_5_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_5_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_5_gen_lib8, @function
+inner_kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_5_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_5_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x1f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x39, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0x88, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_5_gen_lib8, .-inner_kernel_sgecp_8_5_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_6_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_6_lib8, @function
+inner_kernel_sgecp_8_6_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_6_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_6_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_6_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_6_lib8, .-inner_kernel_sgecp_8_6_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_6_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_6_gen_lib8, @function
+inner_kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_6_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_6_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x3f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x4e, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xcc, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_6_gen_lib8, .-inner_kernel_sgecp_8_6_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_7_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_7_lib8, @function
+inner_kernel_sgecp_8_7_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_7_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_7_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_7_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmovaps		%ymm0, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_7_lib8, .-inner_kernel_sgecp_8_7_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12d   <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGECP_8_7_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgecp_8_7_gen_lib8, @function
+inner_kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgecp_8_7_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_7_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	movq	%r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$4, %r10d
+
+	vmovaps		32(%r11), %ymm0
+	vmovaps		32(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 32(%r13)
+	addq		$128, %r11
+	addq		$128, %rax
+
+	vmovaps		-64(%r11), %ymm0
+	vmovaps		-64(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 64(%r13)
+	addq		$128, %r13
+
+	vmovaps		-32(%r11), %ymm0
+	vmovaps		-32(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, -32(%r13)
+
+	cmpl		$3, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		0(%rax), %ymm1
+	vblendps	$0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1
+	vblendps	$0xee, %ymm0, %ymm1, %ymm0
+	vmaskmovps	%ymm0, %ymm15, 0(%r13)
+	subl		$1, %r10d
+	addq		$32, %r11
+	addq		$32, %rax
+	addq		$32, %r13
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_7_gen_lib8, .-inner_kernel_sgecp_8_7_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                              rdi    rsi       rdx
+// void kernel_sgecp_8_0_lib8(int k, float *A, float *B);
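+//
+// As a rough C reference (a sketch only, assuming the panel-major lib8 layout
+// where each column of an 8-row panel occupies 8 consecutive floats), the
+// kernel copies k columns of one panel:
+//
+//   for (int jj = 0; jj < k; jj++)
+//     for (int ii = 0; ii < 8; ii++)
+//       B[ii + 8*jj] = A[ii + 8*jj];
+//
+// The assembly below is this copy unrolled by 4 columns per main-loop iteration.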
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_0_lib8
+	.type kernel_sgecp_8_0_lib8, @function
+kernel_sgecp_8_0_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_0_lib8
+_kernel_sgecp_8_0_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_0_lib8
+	.def kernel_sgecp_8_0_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_0_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_0_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_0_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_0_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_0_lib8, .-kernel_sgecp_8_0_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi       rdx       rcx
+// void kernel_sgecp_8_0_gen_lib8(int k, float *A, float *B, int m1);
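+//
+// Rough C reference (a sketch under the same lib8 layout assumption): the same
+// copy as kernel_sgecp_8_0_lib8, except that only the first m1 rows of each
+// column are stored, via the vmaskmovps row mask built from m1:
+//
+//   for (int jj = 0; jj < k; jj++)
+//     for (int ii = 0; ii < m1 && ii < 8; ii++)
+//       B[ii + 8*jj] = A[ii + 8*jj];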
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_0_gen_lib8
+	.type kernel_sgecp_8_0_gen_lib8, @function
+kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_0_gen_lib8
+_kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_0_gen_lib8
+	.def kernel_sgecp_8_0_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_0_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_0_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_0_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_0_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_0_gen_lib8, .-kernel_sgecp_8_0_gen_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi       rdx      rcx
+// void kernel_sgecp_8_1_lib8(int k, float *A, int sda, float *B);
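+//
+// Rough C reference (a sketch; sda is taken here as the panel-major leading
+// dimension, so A + 8*sda points to the next 8-row panel): this is a copy with
+// a row offset of 1, where each output column takes source rows 1..7 of the
+// current panel and source row 0 of the next panel:
+//
+//   float *A1 = A + 8*sda;
+//   for (int jj = 0; jj < k; jj++)
+//     for (int ii = 0; ii < 8; ii++)
+//       B[ii + 8*jj] = (ii+1 < 8) ? A[(ii+1) + 8*jj] : A1[(ii+1-8) + 8*jj];
+//
+// The kernel_sgecp_8_2_lib8 ... kernel_sgecp_8_7_lib8 variants below follow the
+// same pattern with row offsets 2..7; the blend / vpermilps / vperm2f128
+// sequences in the inner kernels implement the corresponding 8-element rotation
+// across the two source panels.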
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_1_lib8
+	.type kernel_sgecp_8_1_lib8, @function
+kernel_sgecp_8_1_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_1_lib8
+_kernel_sgecp_8_1_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_1_lib8
+	.def kernel_sgecp_8_1_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_1_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_1_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_1_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_1_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_1_lib8, .-kernel_sgecp_8_1_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi       rdx      rcx       r8
+// void kernel_sgecp_8_1_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_1_gen_lib8
+	.type kernel_sgecp_8_1_gen_lib8, @function
+kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_1_gen_lib8
+_kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_1_gen_lib8
+	.def kernel_sgecp_8_1_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_1_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_1_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_1_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_1_gen_lib8, .-kernel_sgecp_8_1_gen_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi       rdx      rcx
+// void kernel_sgecp_8_2_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_2_lib8
+	.type kernel_sgecp_8_2_lib8, @function
+kernel_sgecp_8_2_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_2_lib8
+_kernel_sgecp_8_2_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_2_lib8
+	.def kernel_sgecp_8_2_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_2_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_2_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_2_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_2_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_2_lib8, .-kernel_sgecp_8_2_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi       rdx      rcx       r8
+// void kernel_sgecp_8_2_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_2_gen_lib8
+	.type kernel_sgecp_8_2_gen_lib8, @function
+kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_2_gen_lib8
+_kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_2_gen_lib8
+	.def kernel_sgecp_8_2_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_2_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_2_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_2_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_2_gen_lib8, .-kernel_sgecp_8_2_gen_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi       rdx      rcx
+// void kernel_sgecp_8_3_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_3_lib8
+	.type kernel_sgecp_8_3_lib8, @function
+kernel_sgecp_8_3_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_3_lib8
+_kernel_sgecp_8_3_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_3_lib8
+	.def kernel_sgecp_8_3_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_3_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_3_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_3_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_3_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_3_lib8, .-kernel_sgecp_8_3_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi       rdx      rcx       r8
+// void kernel_sgecp_8_3_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_3_gen_lib8
+	.type kernel_sgecp_8_3_gen_lib8, @function
+kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_3_gen_lib8
+_kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_3_gen_lib8
+	.def kernel_sgecp_8_3_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_3_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_3_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_3_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_3_gen_lib8, .-kernel_sgecp_8_3_gen_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi       rdx      rcx
+// void kernel_sgecp_8_4_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_4_lib8
+	.type kernel_sgecp_8_4_lib8, @function
+kernel_sgecp_8_4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_4_lib8
+_kernel_sgecp_8_4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_4_lib8
+	.def kernel_sgecp_8_4_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_4_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_4_lib8, .-kernel_sgecp_8_4_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi       rdx      rcx       r8
+// void kernel_sgecp_8_4_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_4_gen_lib8
+	.type kernel_sgecp_8_4_gen_lib8, @function
+kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_4_gen_lib8
+_kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_4_gen_lib8
+	.def kernel_sgecp_8_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_4_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_4_gen_lib8, .-kernel_sgecp_8_4_gen_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi       rdx      rcx
+// void kernel_sgecp_8_5_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_5_lib8
+	.type kernel_sgecp_8_5_lib8, @function
+kernel_sgecp_8_5_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_5_lib8
+_kernel_sgecp_8_5_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_5_lib8
+	.def kernel_sgecp_8_5_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_5_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_5_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_5_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_5_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_5_lib8, .-kernel_sgecp_8_5_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi       rdx      rcx       r8
+// void kernel_sgecp_8_5_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_5_gen_lib8
+	.type kernel_sgecp_8_5_gen_lib8, @function
+kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_5_gen_lib8
+_kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_5_gen_lib8
+	.def kernel_sgecp_8_5_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_5_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_5_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_5_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_5_gen_lib8, .-kernel_sgecp_8_5_gen_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi       rdx      rcx
+// void kernel_sgecp_8_6_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_6_lib8
+	.type kernel_sgecp_8_6_lib8, @function
+kernel_sgecp_8_6_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_6_lib8
+_kernel_sgecp_8_6_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_6_lib8
+	.def kernel_sgecp_8_6_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_6_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_6_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_6_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_6_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_6_lib8, .-kernel_sgecp_8_6_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi       rdx      rcx       r8
+// void kernel_sgecp_8_6_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_6_gen_lib8
+	.type kernel_sgecp_8_6_gen_lib8, @function
+kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_6_gen_lib8
+_kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_6_gen_lib8
+	.def kernel_sgecp_8_6_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_6_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_6_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_6_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_6_gen_lib8, .-kernel_sgecp_8_6_gen_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi       rdx      rcx
+// void kernel_sgecp_8_7_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_7_lib8
+	.type kernel_sgecp_8_7_lib8, @function
+kernel_sgecp_8_7_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_7_lib8
+_kernel_sgecp_8_7_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_7_lib8
+	.def kernel_sgecp_8_7_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_7_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_7_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_7_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_7_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_7_lib8, .-kernel_sgecp_8_7_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi       rdx      rcx       r8
+// void kernel_sgecp_8_7_gen_lib8(int k, float *A, int sda, float *B, int m0);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgecp_8_7_gen_lib8
+	.type kernel_sgecp_8_7_gen_lib8, @function
+kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgecp_8_7_gen_lib8
+_kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgecp_8_7_gen_lib8
+	.def kernel_sgecp_8_7_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_7_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgecp kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // 8*sda*sizeof(float)
+	sall	$5, %r12d
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGECP_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgecp_8_7_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgecp_8_7_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgecp_8_7_gen_lib8, .-kernel_sgecp_8_7_gen_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
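+// note (reading aid only): the .long values above are IEEE-754 single-precision
+// bit patterns, e.g. 1056964608 = 0x3f000000 = 0.5f, 1065353216 = 0x3f800000 = 1.0f,
+// 3212836864 = 0xbf800000 = -1.0f.  The 0.5-offset ramps LC00..LC02 are the kind of
+// constants compared against the row bounds to build masked-store masks in the
+// *_gen kernels; their actual uses are outside this excerpt.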
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_16x4_lib8.S b/kernel/avx/kernel_sgemm_16x4_lib8.S
new file mode 100644
index 0000000..5c2d6c4
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_16x4_lib8.S
@@ -0,0 +1,7057 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
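+// note (reading aid only): ARG1..ARG6 follow the System V AMD64 calling convention
+// (rdi, rsi, rdx, rcx, r8, r9; further arguments live on the stack and, once the
+// prologue has lowered rsp by STACKSIZE, are found at STACKSIZE+8(%rsp) and up),
+// while the Windows x64 branch maps ARG1..ARG4 to rcx, rdx, r8, r9 and finds ARG5
+// at 40(%rsp) past the return address and the 32-byte shadow space.  The
+// prologue/epilogue save and restore each ABI's callee-saved registers (including
+// xmm6..xmm15 on Windows) and issue vzeroupper around the AVX code.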
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
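+
+// note: as a reading aid only, the pseudo-C below sketches the update performed by
+// this routine (acc_lo/acc_hi, A0, A1 are illustrative names, not symbols in this
+// file; A0 = A and A1 = A + 8*sda floats are the two stacked 8-row panels of A):
+//
+//   for(kk=0; kk<k; kk++)
+//     for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<8; ii++)
+//         {
+//         acc_lo[ii+8*jj] += A0[ii+8*kk] * B[jj+8*kk]; // ymm0..ymm3, D rows 0..7
+//         acc_hi[ii+8*jj] += A1[ii+8*kk] * B[jj+8*kk]; // ymm4..ymm7, D rows 8..15
+//         }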
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nt_16x4_lib8, @function
+inner_kernel_gemm_add_nt_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nt_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_16x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 8*sda*sizeof(float)
+
+	// preload
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vmovaps			0(%r11), %ymm8 // A0
+	vmovaps			0(%r15), %ymm9 // A1
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+//  8 A0
+//  9 A1
+// 10 A0+
+// 11 A1+
+// 12 B
+// 13 B+
+// 14 Bt
+// 15 tmp
+	
+	// unroll 0
+	vmulps			%ymm8, %ymm14, %ymm15
+	vbroadcastf128	32(%r13), %ymm13 // B
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	subl	$4, %r10d
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			32(%r11), %ymm10 // A0
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			32(%r15), %ymm11 // A1
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm8, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 1
+	vmulps			%ymm10, %ymm14, %ymm15
+	vbroadcastf128	64(%r13), %ymm12 // B
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			64(%r11), %ymm8 // A0
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			64(%r15), %ymm9 // A1
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 2
+	vmulps			%ymm8, %ymm14, %ymm15
+	vbroadcastf128	96(%r13), %ymm13 // B
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	addq	$128, %r13
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			96(%r11), %ymm10 // A0
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	addq	$128, %r11
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			96(%r15), %ymm11 // A1
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	addq	$128, %r15
+	vmulps			%ymm8, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 3
+	vmulps			%ymm10, %ymm14, %ymm15
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			0(%r11), %ymm8 // A0
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			0(%r15), %ymm9 // A1
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vmulps			%ymm8, %ymm14, %ymm15
+	vbroadcastf128	32(%r13), %ymm13 // B
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	subl	$4, %r10d
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			32(%r11), %ymm10 // A0
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			32(%r15), %ymm11 // A1
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm8, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 1
+	vmulps			%ymm10, %ymm14, %ymm15
+	vbroadcastf128	64(%r13), %ymm12 // B
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			64(%r11), %ymm8 // A0
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			64(%r15), %ymm9 // A1
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 2
+	vmulps			%ymm8, %ymm14, %ymm15
+	vbroadcastf128	96(%r13), %ymm13 // B
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	addq	$128, %r13
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			96(%r11), %ymm10 // A0
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	addq	$128, %r11
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			96(%r15), %ymm11 // A1
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	addq	$128, %r15
+	vmulps			%ymm8, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 3
+	vmulps			%ymm10, %ymm14, %ymm15
+//	vbroadcastf128	0(%r13), %ymm12 // B
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	vmulps			%ymm10, %ymm14, %ymm15
+//	vmovaps			0(%r11), %ymm8 // A0
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm10, %ymm14, %ymm15
+//	vmovaps			0(%r15), %ymm9 // A1
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm14, %ymm15
+//	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+//	cmpl	$4, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vmovaps			0(%r11), %ymm8 // A0
+	vmovaps			0(%r15), %ymm9 // A1
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vmulps			%ymm8, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vmulps			%ymm8, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vmulps			%ymm8, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$32, %r13
+	addq	$32, %r15
+
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vmulps			%ymm8, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nt_16x4_lib8, .-inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
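+
+// note: reading aid only; this routine performs the same loop sketched above for
+// inner_kernel_gemm_add_nt_16x4_lib8, with the accumulation sign flipped, i.e.
+// acc_lo[ii+8*jj] -= A0[ii+8*kk] * B[jj+8*kk] (and likewise for acc_hi).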
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_sub_nt_16x4_lib8, @function
+inner_kernel_gemm_sub_nt_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_sub_nt_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_16x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 8*sda*sizeof(float)
+
+	// preload
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vmovaps			0(%r11), %ymm8 // A0
+	vmovaps			0(%r15), %ymm9 // A1
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+//  8 A0
+//  9 A1
+// 10 A0+
+// 11 A1+
+// 12 B
+// 13 B+
+// 14 Bt
+// 15 tmp
+	
+	// unroll 0
+	vmulps			%ymm8, %ymm14, %ymm15
+	vbroadcastf128	32(%r13), %ymm13 // B
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	subl	$4, %r10d
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			32(%r11), %ymm10 // A0
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			32(%r15), %ymm11 // A1
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm8, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 1
+	vmulps			%ymm10, %ymm14, %ymm15
+	vbroadcastf128	64(%r13), %ymm12 // B
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			64(%r11), %ymm8 // A0
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			64(%r15), %ymm9 // A1
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 2
+	vmulps			%ymm8, %ymm14, %ymm15
+	vbroadcastf128	96(%r13), %ymm13 // B
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	addq	$128, %r13
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			96(%r11), %ymm10 // A0
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	addq	$128, %r11
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			96(%r15), %ymm11 // A1
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	addq	$128, %r15
+	vmulps			%ymm8, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 3
+	vmulps			%ymm10, %ymm14, %ymm15
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			0(%r11), %ymm8 // A0
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			0(%r15), %ymm9 // A1
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm7, %ymm7
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vmulps			%ymm8, %ymm14, %ymm15
+	vbroadcastf128	32(%r13), %ymm13 // B
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	subl	$4, %r10d
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			32(%r11), %ymm10 // A0
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			32(%r15), %ymm11 // A1
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm8, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 1
+	vmulps			%ymm10, %ymm14, %ymm15
+	vbroadcastf128	64(%r13), %ymm12 // B
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			64(%r11), %ymm8 // A0
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vmovaps			64(%r15), %ymm9 // A1
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 2
+	vmulps			%ymm8, %ymm14, %ymm15
+	vbroadcastf128	96(%r13), %ymm13 // B
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	addq	$128, %r13
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			96(%r11), %ymm10 // A0
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	addq	$128, %r11
+	vmulps			%ymm8, %ymm14, %ymm15
+	vmovaps			96(%r15), %ymm11 // A1
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	addq	$128, %r15
+	vmulps			%ymm8, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 3
+	vmulps			%ymm10, %ymm14, %ymm15
+//	vbroadcastf128	0(%r13), %ymm12 // B
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	vmulps			%ymm10, %ymm14, %ymm15
+//	vmovaps			0(%r11), %ymm8 // A0
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	vmulps			%ymm10, %ymm14, %ymm15
+//	vmovaps			0(%r15), %ymm9 // A1
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm14, %ymm15
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	vmulps			%ymm10, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm14, %ymm15
+//	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vsubps			%ymm15, %ymm7, %ymm7
+
+
+//	cmpl	$4, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vmovaps			0(%r11), %ymm8 // A0
+	vmovaps			0(%r15), %ymm9 // A1
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vmulps			%ymm8, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm4, %ymm4
+
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vmulps			%ymm8, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm5, %ymm5
+
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vmulps			%ymm8, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm6, %ymm6
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$32, %r13
+	addq	$32, %r15
+
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vmulps			%ymm8, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm9, %ymm14, %ymm15
+	vsubps			%ymm15, %ymm7, %ymm7
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_sub_nt_16x4_lib8, .-inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B
+// r14   <- 8*sdb*sizeof(float)
+// r15   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r14   <- 8*sdb*sizeof(float)
+// r15   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
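+
+// note: reading aid only; in pseudo-C (illustrative names; B is panel-major with
+// block size 8 and panel stride sdb, A0 = A, A1 = A + 8*sda floats):
+//
+//   for(kk=0; kk<k; kk++)
+//     for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<8; ii++)
+//         {
+//         acc_lo[ii+8*jj] += A0[ii+8*kk] * B[(kk/8)*8*sdb + kk%8 + 8*jj]; // ymm0..ymm3
+//         acc_hi[ii+8*jj] += A1[ii+8*kk] * B[(kk/8)*8*sdb + kk%8 + 8*jj]; // ymm4..ymm7
+//         }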
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nn_16x4_lib8, @function
+inner_kernel_gemm_add_nn_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nn_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_16x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovaps 		0(%r11), %ymm13 // A
+	vmovaps 		0(%r11, %r12, 1), %ymm14 // A
+
+	cmpl	$8, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	0(%r13, %r14, 1) // software prefetch
+	prefetcht0	64(%r13, %r14, 1) // software prefetch
+
+	// unroll 0
+	vbroadcastss	0(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			32(%r11), %ymm10 // A
+	vbroadcastss	32(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	64(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	96(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+	subl	$8, %r10d
+
+	// unroll 1
+	vbroadcastss	4(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastss	36(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	68(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	100(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 2
+	vbroadcastss	8(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			96(%r11), %ymm10 // A
+	vbroadcastss	40(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	72(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	104(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 3
+	vbroadcastss	12(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			128(%r11), %ymm13 // A
+	vbroadcastss	44(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			128(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	76(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	108(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 4
+	vbroadcastss	16(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			160(%r11), %ymm10 // A
+	vbroadcastss	48(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			160(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	80(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	112(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 5
+	vbroadcastss	20(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			192(%r11), %ymm13 // A
+	vbroadcastss	52(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			192(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	84(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	116(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 6
+	vbroadcastss	24(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			224(%r11), %ymm10 // A
+	vbroadcastss	56(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			224(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	88(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	120(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+	addq	$256, %r11
+
+	// unroll 7
+	vbroadcastss	28(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastss	60(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	92(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	124(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+	addq	%r14, %r13
+
+	cmpl	$8, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$7, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vbroadcastss	0(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			32(%r11), %ymm10 // A
+	vbroadcastss	32(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	64(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	96(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+	subl	$8, %r10d
+
+	// unroll 1
+	vbroadcastss	4(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastss	36(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	68(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	100(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 2
+	vbroadcastss	8(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			96(%r11), %ymm10 // A
+	vbroadcastss	40(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	72(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	104(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 3
+	vbroadcastss	12(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			128(%r11), %ymm13 // A
+	vbroadcastss	44(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			128(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	76(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	108(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 4
+	vbroadcastss	16(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			160(%r11), %ymm10 // A
+	vbroadcastss	48(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			160(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	80(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	112(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 5
+	vbroadcastss	20(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			192(%r11), %ymm13 // A
+	vbroadcastss	52(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			192(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	84(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	116(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	// unroll 6
+	vbroadcastss	24(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vmovapd			224(%r11), %ymm10 // A
+	vbroadcastss	56(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vmovapd			224(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	88(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	120(%r13), %ymm12 // B
+	vmulps			%ymm13, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm14, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+	addq	$256, %r11
+
+	// unroll 7
+	vbroadcastss	28(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastss	60(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+//	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	92(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	124(%r13), %ymm12 // B
+	vmulps			%ymm10, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm11, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+	addq	%r14, %r13
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A0
+	vmovaps			0(%r11, %r12, 1), %ymm13 // A1
+	vbroadcastss	0(%r13), %ymm14 // B[0]
+	vmulps			%ymm12, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm13, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	32(%r13), %ymm14 // B[1]
+	vmulps			%ymm12, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm13, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	64(%r13), %ymm14 // B[2]
+	vmulps			%ymm12, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm13, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	96(%r13), %ymm14 // B[3]
+	vmulps			%ymm12, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vmulps			%ymm13, %ymm14, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$4, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nn_16x4_lib8, .-inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(float)
+// r15   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(8-offB)
+// r11   <- A+(8-offB)*bs*sizeof(float)
+// r12   <- bs*sda*sizeof(float)
+// r13   <- B-offB+bs*sdb*sizeof(float)
+// r14   <- bs*sdb*sizeof(float)
+// r15   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
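+
+// note: reading aid only; this edge consumes the first min(k, 8-offB) iterations
+// one B row at a time so that, on exit, B is aligned to an 8-row panel boundary
+// and the full-panel nn kernel above can take over.  In pseudo-C (illustrative
+// names, A0 = A, A1 = A + 8*sda floats):
+//
+//   for(kk=0; kk<k && kk<8-offB; kk++)
+//     for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<8; ii++)
+//         {
+//         acc_lo[ii+8*jj] += A0[ii+8*kk] * B[offB+kk + 8*jj];
+//         acc_hi[ii+8*jj] += A1[ii+8*kk] * B[offB+kk + 8*jj];
+//         }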
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_gemm_add_nn_16x4_lib8, @function
+inner_edge_gemm_add_nn_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_gemm_add_nn_16x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_16x4_lib8:
+#endif
+#endif
+	
+	cmpl			$0, %r15d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$8, %ebx
+	subl			%r15d, %ebx // 8-offsetB
+	cmpl			%r10d, %ebx
+//	jle				0f
+//	movl			%r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+	cmovgl			%r10d, %ebx // kend=min(k,8-offsetB)
+
+	movl			%r15d, %eax
+	sall			$2, %eax // offsetB*sizeof(float)
+	addq			%rax, %r13 // B+offsetB*sizeof(float)
+
+1:
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A0
+	vmovaps			0(%r11, %r12, 1), %ymm13 // A1
+	vbroadcastss	0(%r13), %ymm15 // B[0]
+	vmulps			%ymm12, %ymm15, %ymm14
+	vaddps			%ymm14, %ymm0, %ymm0
+	vmulps			%ymm13, %ymm15, %ymm14
+	vaddps			%ymm14, %ymm4, %ymm4
+	vbroadcastss	32(%r13), %ymm15 // B[1]
+	vmulps			%ymm12, %ymm15, %ymm14
+	vaddps			%ymm14, %ymm1, %ymm1
+	vmulps			%ymm13, %ymm15, %ymm14
+	vaddps			%ymm14, %ymm5, %ymm5
+	vbroadcastss	64(%r13), %ymm15 // B[2]
+	vmulps			%ymm12, %ymm15, %ymm14
+	vaddps			%ymm14, %ymm2, %ymm2
+	vmulps			%ymm13, %ymm15, %ymm14
+	vaddps			%ymm14, %ymm6, %ymm6
+	vbroadcastss	96(%r13), %ymm15 // B[3]
+	vmulps			%ymm12, %ymm15, %ymm14
+	vaddps			%ymm14, %ymm3, %ymm3
+	vmulps			%ymm13, %ymm15, %ymm14
+	vaddps			%ymm14, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	subl			$1, %ebx // end-1
+	addq			$32, %r11 // A+1*bs*sizeof(float)
+	addq			$4, %r13 // B+1*sizeof(float)
+
+	cmpl			$0, %ebx
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r14, %r13
+	subq			$32, %r13 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_gemm_add_nn_16x4_lib8, .-inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(float)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(float)
+// r12   <- bs*sda*sizeof(float)
+// r13   <- B-offB+bs*sdb*sizeof(float)
+// r14   <- bs*sdb*sizeof(float)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
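+
+// note: reading aid only; for the aligned case (offB<=4) the three steps below
+// accumulate the triangular leading rows of B (lower triangular, panel-major with
+// block size 8; indexing illustrative, D columns are the 16-element accumulators):
+//
+//   kk=0:  D[:,0]    += A[:,0] * B[offB+0 + 8*0]
+//   kk=1:  D[:,0..1] += A[:,1] * B[offB+1 + 8*(0..1)]
+//   kk=2:  D[:,0..2] += A[:,2] * B[offB+2 + 8*(0..2)]
+//
+// the remaining rows of B are handled by the general nn kernel; the offB>4
+// branches do the same while also stepping B to the next 8-row panel.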
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trmm_nn_rl_16x4_lib8, @function
+inner_edge_trmm_nn_rl_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trmm_nn_rl_16x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_16x4_lib8:
+#endif
+#endif
+	
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	movl		%r15d, %eax
+	sall		$2, %eax // offsetB*sizeof(float)
+	movq		%r13, %rbx // B
+	addq		%rax, %rbx // B+offsetB*sizeof(float)
+
+
+	cmpl	$4, %r15d
+	jg		1f
+
+	// offB==0, 1, 2, 3, 4
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	8(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	40(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	72(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	jmp			0f // end
+
+
+1:
+	cmpl	$5, %r15d
+	jg		1f
+
+	// offB==5
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	8(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	40(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	72(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r14, %r13 // B+8*sdb*sizeof(float)
+	movl		$0, %r15d // offsetB=0
+
+	jmp			0f // end
+
+
+1:
+	cmpl	$6, %r15d
+	jg		1f
+
+	// offB==6
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r14, %r13 // B+8*sdb*sizeof(float)
+	movq		%r13, %rbx // B
+	movl		$0, %r15d // offsetB=0
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	32(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	64(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	jmp			0f // end
+
+
+1:
+//	cmpl	$7, %r15d
+//	jg		0f
+
+	// offB==7
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r14, %r13 // B+8*sdb*sizeof(float)
+	movq		%r13, %rbx // B
+	movl		$0, %r15d // offsetB=0
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	32(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	68(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vmulps			%ymm9, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+//	jmp			0f // end
+
+
+	// end
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trmm_nn_rl_16x4_lib8, .-inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
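+// Reference model in C (illustrative sketch only, not used by the build): the
+// 16x4 accumulator held in ymm0-3 (rows 0-7) and ymm4-7 (rows 8-15) is solved
+// in place against the transposed 4x4 lower-triangular factor stored at the
+// pointer in r10 (bs=8 panel-major, element (i,j) at float offset i+8*j), with
+// the precomputed reciprocals of its diagonal in r11. The kn argument (r12d)
+// makes the vs variant stop early; the sketch shows the full 4-column case and
+// uses ad-hoc names.
+//
+// static void trsm_rlt_inv_16x4_ref(float X[16][4], const float *L_panel,
+//                                   const float *inv_diag)
+//     {
+//     for(int j=0; j<4; j++)
+//         {
+//         for(int i=0; i<16; i++)
+//             X[i][j] *= inv_diag[j];                    // divide by L[j][j]
+//         for(int jj=j+1; jj<4; jj++)
+//             for(int i=0; i<16; i++)
+//                 X[i][jj] -= X[i][j] * L_panel[jj+8*j]; // eliminate column j
+//         }
+//     }
+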
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsm_rlt_inv_16x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsm_rlt_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#endif
+#endif
+
+	vbroadcastss	0(%r11), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vmulps			%ymm4, %ymm13, %ymm4
+	cmpl			$2, %r12d
+	jl				0f // ret
+	vbroadcastss	4(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm1, %ymm1
+	vmulps			%ymm4, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm5, %ymm5
+	vbroadcastss	8(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vmulps			%ymm4, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm6, %ymm6
+	vbroadcastss	12(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+	vmulps			%ymm4, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	4(%r11), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vmulps			%ymm5, %ymm13, %ymm5
+	cmpl			$3, %r12d
+	jl				0f // ret
+	vbroadcastss	40(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vmulps			%ymm5, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm6, %ymm6
+	vbroadcastss	44(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+	vmulps			%ymm5, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	8(%r11), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vmulps			%ymm6, %ymm13, %ymm6
+	cmpl			$4, %r12d
+	jl				0f // ret
+	vbroadcastss	76(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+	vmulps			%ymm6, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	12(%r11), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+	vmulps			%ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsm_rlt_inv_16x4_vs_lib8, .-inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
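+// Reference model in C (illustrative sketch only, not used by the build): the
+// top 4x4 of the 16x4 accumulator is factorized as L*L^T, 1/L[j][j] is written
+// to inv_diag_E (r10), every processed column is scaled by it, and trailing
+// columns receive the rank-1 update; a non-positive pivot falls through to the
+// zero-setting branches (labels 1/3/5/7 below). kn (r11d) limits the columns.
+//
+// #include <math.h>
+// static void potrf_16x4_ref(float X[16][4], float *inv_diag_E, int kn)
+//     {
+//     for(int j=0; j<4; j++)
+//         {
+//         float d = X[j][j];
+//         float tmp = d > 0.0f ? 1.0f/sqrtf(d) : 0.0f;
+//         inv_diag_E[j] = tmp;
+//         for(int i=0; i<16; i++)
+//             X[i][j] *= tmp;                        // column j of L
+//         if(j+1 >= kn)
+//             return;                                // vs variant: kn columns only
+//         for(int jj=j+1; jj<4; jj++)
+//             for(int i=0; i<16; i++)
+//                 X[i][jj] -= X[i][j] * X[jj][j];    // trailing update
+//         }
+//     }
+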
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_16x4_vs_lib8, @function
+inner_edge_potrf_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_16x4_vs_lib8:
+#endif
+#endif
+
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vmovss		%xmm0, %xmm0, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+2:
+	vmovss		%xmm13, 0(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm0
+	vmulps		%ymm4, %ymm13, %ymm4
+	cmpl		$2, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm1, %ymm1
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm5, %ymm5
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vpermilps	$0x55, %xmm1, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+4:
+	vmovss		%xmm13, 4(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm1
+	vmulps		%ymm5, %ymm13, %ymm5
+	cmpl		$3, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm11
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vmulps		%ymm5, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vmulps		%ymm5, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vpermilps	$0xaa, %xmm2, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+6:
+	vmovss		%xmm13, 8(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm2
+	vmulps		%ymm6, %ymm13, %ymm6
+	cmpl		$4, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm11
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vmulps		%ymm6, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vpermilps	$0xff, %xmm3, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+8:
+	vmovss		%xmm13, 12(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm3
+	vmulps		%ymm7, %ymm13, %ymm7
+
+	jmp		0f
+
+
+1:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_16x4_vs_lib8, .-inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_12x4_vs_lib8, @function
+inner_edge_potrf_12x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_12x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_12x4_vs_lib8:
+#endif
+#endif
+
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vextractf128	$0x1, %ymm0, %xmm13
+//	vpermilps		$0x00, %xmm13, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+2:
+	vmovss			%xmm13, 0(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vmulps			%ymm4, %ymm13, %ymm4
+	cmpl		$2, %r11d
+	jl			0f // ret
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm11
+	vpermilps		$0x55, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm1, %ymm1
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm5, %ymm5
+	vpermilps		$0xaa, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps		$0xff, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vextractf128	$0x1, %ymm1, %xmm13
+	vpermilps		$0x55, %xmm13, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+4:
+	vmovss			%xmm13, 4(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vmulps			%ymm5, %ymm13, %ymm5
+	cmpl		$3, %r11d
+	jl			0f // ret
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm11
+	vpermilps		$0xaa, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vmulps		%ymm5, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps		$0xff, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vmulps		%ymm5, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vpermilps		$0xaa, %xmm13, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+6:
+	vmovss			%xmm13, 8(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vmulps			%ymm6, %ymm13, %ymm6
+	cmpl		$4, %r11d
+	jl			0f // ret
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm11
+	vpermilps		$0xff, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vmulps		%ymm6, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vextractf128	$0x1, %ymm3, %xmm13
+	vpermilps		$0xff, %xmm13, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+8:
+	vmovss			%xmm13, 12(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+	vmulps			%ymm7, %ymm13, %ymm7
+
+	jmp		0f
+
+
+1:
+	vxorps			%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_12x4_vs_lib8, .-inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
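+// Reference model in C (illustrative sketch only, not used by the build): the
+// accumulator becomes alpha*acc + beta*C, with C read from two bs=8 panels
+// 8*sdc floats apart; when beta==0.0 the routine skips the C loads entirely,
+// which gives the same result. Names below are ad hoc.
+//
+// static void scale_ab_16x4_ref(float acc[16][4], float alpha, float beta,
+//                               const float *C0, int sdc)
+//     {
+//     const float *C1 = C0 + 8*sdc;              // second 8-row panel of C
+//     for(int j=0; j<4; j++)
+//         for(int i=0; i<16; i++)
+//             {
+//             float c = i<8 ? C0[i+8*j] : C1[i-8+8*j];
+//             acc[i][j] = alpha*acc[i][j] + beta*c;
+//             }
+//     }
+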
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_16x4_lib8, @function
+inner_scale_ab_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_16x4_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	movq	%r12, %r15 // C1 <- C0
+	addq	%r13, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	vmovaps		0(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+	vmovaps		0(%r15), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+	vmovaps		32(%r15), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+	vmovaps		64(%r15), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+	vmovaps		96(%r15), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_16x4_lib8, .-inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// r15  <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_16x4_gen_lib8, @function
+inner_scale_ab_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_16x4_gen_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+
+	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			7f // end
+
+	movq	%r13, %rax // C1 <- C0
+	addq	%r14, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+
+	vmovaps		0(%rax), %ymm14
+	vmulps		%ymm14, %ymm15, %ymm14
+	vaddps		%ymm4, %ymm14, %ymm4
+	vmovaps		32(%rax), %ymm14
+	vmulps		%ymm14, %ymm15, %ymm14
+	vaddps		%ymm5, %ymm14, %ymm5
+	vmovaps		64(%rax), %ymm14
+	vmulps		%ymm14, %ymm15, %ymm14
+	vaddps		%ymm6, %ymm14, %ymm6
+	vmovaps		96(%rax), %ymm14
+	vmulps		%ymm14, %ymm15, %ymm14
+	vaddps		%ymm7, %ymm14, %ymm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%rax, %rbx // C1
+	addq	%r14, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_16x4_gen_lib8, .-inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10   <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_A0_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_a0_16x4_lib8, @function
+inner_scale_a0_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_a0_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_16x4_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_a0_16x4_lib8, .-inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// r11   <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// r11   <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_16x4_lib8, @function
+inner_scale_11_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_11_16x4_lib8:
+#endif
+#endif
+	
+	movq	%r10, %r15 // C1 <- C0
+	addq	%r11, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	vmovaps		0(%r10), %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r10), %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r10), %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r10), %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+	vmovaps		0(%r15), %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+	vmovaps		32(%r15), %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+	vmovaps		64(%r15), %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+	vmovaps		96(%r15), %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_16x4_lib8, .-inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_16X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_16x4_gen_lib8, @function
+inner_scale_11_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_16x4_gen_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // C1 <- C0
+	addq	%r12, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r11), %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r11), %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r11), %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r11), %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+
+	vmovaps		0(%rax), %ymm14
+	vaddps		%ymm4, %ymm14, %ymm4
+	vmovaps		32(%rax), %ymm14
+	vaddps		%ymm5, %ymm14, %ymm5
+	vmovaps		64(%rax), %ymm14
+	vaddps		%ymm6, %ymm14, %ymm6
+	vmovaps		96(%rax), %ymm14
+	vaddps		%ymm7, %ymm14, %ymm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%rax, %rbx // C1
+	addq	%r12, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_16x4_gen_lib8, .-inner_scale_11_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// r11  <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
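+// Reference model in C (illustrative sketch only, not used by the build): the
+// 16x4 result is written back in the same bs=8 panel-major layout, rows 0-7 to
+// the panel at D and rows 8-15 to the panel 8*sdd floats further on.
+//
+// static void store_16x4_ref(const float acc[16][4], float *D0, int sdd)
+//     {
+//     float *D1 = D0 + 8*sdd;                    // second 8-row panel of D
+//     for(int j=0; j<4; j++)
+//         for(int i=0; i<16; i++)
+//             {
+//             if(i<8) D0[i+8*j]   = acc[i][j];
+//             else    D1[i-8+8*j] = acc[i][j];
+//             }
+//     }
+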
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_16x4_lib8, @function
+inner_store_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_16x4_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_lib8:
+#endif
+#endif
+	
+	movq	%r10, %r15 // D1 <- D0
+	addq	%r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps		%ymm3, 96(%r10)
+
+	vmovaps 	%ymm4,  0(%r15)
+	vmovaps 	%ymm5, 32(%r15)
+	vmovaps 	%ymm6, 64(%r15)
+	vmovaps 	%ymm7, 96(%r15)
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_16x4_lib8, .-inner_store_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
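+// Reference model in C (illustrative sketch only, not used by the build): the
+// vs store writes the first row panel in full and clips the second panel to km
+// rows and the tile to kn columns; the asm builds the row mask by broadcasting
+// (float)km and subtracting the per-lane constants at .LC01 so that vmaskmovps
+// keeps exactly the lanes whose global row index is below km.
+//
+// static void store_16x4_vs_ref(const float acc[16][4], float *D0, int sdd,
+//                               int km, int kn)
+//     {
+//     float *D1 = D0 + 8*sdd;
+//     for(int j=0; j<4 && j<kn; j++)
+//         {
+//         for(int i=0; i<8; i++)
+//             D0[i+8*j] = acc[i][j];             // rows 0-7: always stored
+//         for(int i=8; i<16 && i<km; i++)
+//             D1[i-8+8*j] = acc[i][j];           // rows 8-15: masked by km
+//         }
+//     }
+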
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_16X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_16x4_vs_lib8, @function
+inner_store_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	vmovaps		%ymm0, 0(%r10)
+	vmaskmovps	%ymm4, %ymm15, 0(%r10, %r11, 1)
+	cmpl		$2, %r13d
+	jl			0f // end
+	vmovaps		%ymm1, 32(%r10)
+	vmaskmovps	%ymm5, %ymm15, 32(%r10, %r11, 1)
+	cmpl		$3, %r13d
+	jl			0f // end
+	vmovaps		%ymm2, 64(%r10)
+	vmaskmovps	%ymm6, %ymm15, 64(%r10, %r11, 1)
+	je			0f // end
+	vmovaps		%ymm3, 96(%r10)
+	vmaskmovps	%ymm7, %ymm15, 96(%r10, %r11, 1)
+	//
+	jmp		0f
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_16x4_vs_lib8, .-inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_16X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_16x4_gen_lib8, @function
+inner_store_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute D1
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	cmpl		$2, %r15d
+	vmaskmovps	%ymm0, %ymm14,  0(%r11)
+	vmaskmovps	%ymm4, %ymm15,  0(%rbx)
+	jl			7f // end
+	cmpl		$3, %r15d
+	vmaskmovps	%ymm1, %ymm14, 32(%r11)
+	vmaskmovps	%ymm5, %ymm15, 32(%rbx)
+	jl			7f // end
+	vmaskmovps	%ymm2, %ymm14, 64(%r11)
+	vmaskmovps	%ymm6, %ymm15, 64(%rbx)
+	je			7f // end
+	vmaskmovps	%ymm3, %ymm14, 96(%r11)
+	vmaskmovps	%ymm7, %ymm15, 96(%rbx)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbp // D1
+	addq	%r12, %rbp // D2 <- D1 + 8*sdd*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_16x4_gen_lib8, .-inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// r11  <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_16x4_lib8, @function
+inner_store_l_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_16x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_lib8:
+#endif
+#endif
+	
+	vmovaps		32(%r10), %ymm12
+	vmovaps		64(%r10), %ymm13
+	vmovaps		96(%r10), %ymm14
+
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vblendps	$0x03, %ymm13, %ymm2, %ymm2
+	vblendps	$0x07, %ymm14, %ymm3, %ymm3
+
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps		%ymm3, 96(%r10)
+
+	vmovaps 	%ymm4,  0(%r10, %r11, 1)
+	vmovaps 	%ymm5, 32(%r10, %r11, 1)
+	vmovaps 	%ymm6, 64(%r10, %r11, 1)
+	vmovaps 	%ymm7, 96(%r10, %r11, 1)
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_16x4_lib8, .-inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_16X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_16x4_vs_lib8, @function
+inner_store_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	vmovaps		%ymm0, 0(%r10)
+	vmaskmovps	%ymm4, %ymm15, 0(%r10, %r11, 1)
+	cmpl		$2, %r13d
+	jl			0f // end
+	vmovaps		32(%r10), %ymm12
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vmovaps		%ymm1, 32(%r10)
+	vmaskmovps	%ymm5, %ymm15, 32(%r10, %r11, 1)
+	cmpl		$3, %r13d
+	jl			0f // end
+	vmovaps		64(%r10), %ymm12
+	vblendps	$0x03, %ymm12, %ymm2, %ymm2
+	vmovaps		%ymm2, 64(%r10)
+	vmaskmovps	%ymm6, %ymm15, 64(%r10, %r11, 1)
+	je			0f // end
+	vmovaps		96(%r10), %ymm12
+	vblendps	$0x07, %ymm12, %ymm3, %ymm3
+	vmovaps		%ymm3, 96(%r10)
+	vmaskmovps	%ymm7, %ymm15, 96(%r10, %r11, 1)
+	//
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_16x4_vs_lib8, .-inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_16X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_16x4_gen_lib8, @function
+inner_store_l_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm14,  0(%r11)
+	vmaskmovps	%ymm4, %ymm15,  0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmovaps		32(%r11), %ymm12
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm14, 32(%r11)
+	vmaskmovps	%ymm5, %ymm15, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmovaps		64(%r11), %ymm12
+	vblendps	$0x03, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm14, 64(%r11)
+	vmaskmovps	%ymm6, %ymm15, 64(%r11, %r12, 1)
+	je			7f // end
+	vmovaps		96(%r11), %ymm12
+	vblendps	$0x07, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm14, 96(%r11)
+	vmaskmovps	%ymm7, %ymm15, 96(%r11, %r12, 1)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_16x4_gen_lib8, .-inner_store_l_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// r11  <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_12X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_12x4_lib8, @function
+inner_store_l_12x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_12x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_lib8:
+#endif
+#endif
+	
+	vmovaps		0(%r10), %ymm12
+	vmovaps		32(%r10), %ymm13
+	vmovaps		64(%r10), %ymm14
+	vmovaps		96(%r10), %ymm15
+
+	vblendps	$0x0f, %ymm12, %ymm0, %ymm0
+	vblendps	$0x1f, %ymm13, %ymm1, %ymm1
+	vblendps	$0x3f, %ymm14, %ymm2, %ymm2
+	vblendps	$0x7f, %ymm15, %ymm3, %ymm3
+
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps		%ymm3, 96(%r10)
+
+	vmovaps 	%ymm4,  0(%r10, %r11, 1)
+	vmovaps 	%ymm5, 32(%r10, %r11, 1)
+	vmovaps 	%ymm6, 64(%r10, %r11, 1)
+	vmovaps 	%ymm7, 96(%r10, %r11, 1)
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_12x4_lib8, .-inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_12X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_12x4_vs_lib8, @function
+inner_store_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	vmovaps		0(%r10), %ymm12
+	vblendps	$0x0f, %ymm12, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r10)
+	vmaskmovps	%ymm4, %ymm15, 0(%r10, %r11, 1)
+	cmpl		$2, %r13d
+	jl			0f // end
+	vmovaps		32(%r10), %ymm12
+	vblendps	$0x1f, %ymm12, %ymm1, %ymm1
+	vmovaps		%ymm1, 32(%r10)
+	vmaskmovps	%ymm5, %ymm15, 32(%r10, %r11, 1)
+	cmpl		$3, %r13d
+	jl			0f // end
+	vmovaps		64(%r10), %ymm12
+	vblendps	$0x3f, %ymm12, %ymm2, %ymm2
+	vmovaps		%ymm2, 64(%r10)
+	vmaskmovps	%ymm6, %ymm15, 64(%r10, %r11, 1)
+	je			0f // end
+	vmovaps		96(%r10), %ymm12
+	vblendps	$0x7f, %ymm12, %ymm3, %ymm3
+	vmovaps		%ymm3, 96(%r10)
+	vmaskmovps	%ymm7, %ymm15, 96(%r10, %r11, 1)
+	//
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_12x4_vs_lib8, .-inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_12X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_12x4_gen_lib8, @function
+inner_store_l_12x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_12x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmovaps		0(%r11), %ymm12
+	vblendps	$0x0f, %ymm12, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm14,  0(%r11)
+	vmaskmovps	%ymm4, %ymm15,  0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmovaps		32(%r11), %ymm12
+	vblendps	$0x1f, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm14, 32(%r11)
+	vmaskmovps	%ymm5, %ymm15, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmovaps		64(%r11), %ymm12
+	vblendps	$0x3f, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm14, 64(%r11)
+	vmaskmovps	%ymm6, %ymm15, 64(%r11, %r12, 1)
+	je			7f // end
+	vmovaps		96(%r11), %ymm12
+	vblendps	$0x7f, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm14, 96(%r11)
+	vmaskmovps	%ymm7, %ymm15, 96(%r11, %r12, 1)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_12x4_gen_lib8, .-inner_store_l_12x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                                rdi    rsi           rdx       rcx      r8        r9           rsp+8     rsp+16   rsp+24    rsp+32
+// void kernel_sgemm_nt_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
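+//
+// Reference model in C (illustrative sketch only, not used by the build):
+// D = alpha*A*B^T + beta*C on a 16x4 tile, all operands in bs=8 panel-major
+// storage, i.e. element (i,j) of a matrix with panel stride sdx lives at
+// X[(i/8)*8*sdx + i%8 + 8*j]; A spans two 8-row panels, B a single one.
+//
+// static void kernel_sgemm_nt_16x4_ref(int k, float alpha, const float *A, int sda,
+//                                      const float *B, float beta,
+//                                      const float *C, int sdc, float *D, int sdd)
+//     {
+//     for(int j=0; j<4; j++)
+//         for(int i=0; i<16; i++)
+//             {
+//             float acc = 0.0f;
+//             for(int l=0; l<k; l++)
+//                 acc += A[(i/8)*8*sda + i%8 + 8*l] * B[j + 8*l];
+//             D[(i/8)*8*sdd + i%8 + 8*j] = alpha*acc
+//                 + beta*C[(i/8)*8*sdc + i%8 + 8*j];
+//             }
+//     }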
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_16x4_lib8
+	.type kernel_sgemm_nt_16x4_lib8, @function
+kernel_sgemm_nt_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_16x4_lib8
+_kernel_sgemm_nt_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	%rsi, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_16x4_lib8, .-kernel_sgemm_nt_16x4_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5         6            7         8        9         10       11      12
+// void kernel_sgemm_nt_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_16x4_vs_lib8
+	.type kernel_sgemm_nt_16x4_vs_lib8, @function
+kernel_sgemm_nt_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_16x4_vs_lib8
+_kernel_sgemm_nt_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_16x4_vs_lib8
+	.def kernel_sgemm_nt_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_16x4_vs_lib8, .-kernel_sgemm_nt_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                    rdi    rsi           rdx       rcx      r8        r9           rsp+8        rsp+16    rsp+24   rsp+32       rsp+40    rsp+48   rsp+56  rsp+64  rsp+72  rsp+80
+// void kernel_sgemm_nt_16x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_16x4_gen_lib8
+	.type kernel_sgemm_nt_16x4_gen_lib8, @function
+kernel_sgemm_nt_16x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_16x4_gen_lib8
+_kernel_sgemm_nt_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_16x4_gen_lib8
+	.def kernel_sgemm_nt_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // offsetC
+	movq	ARG8, %r13 // C
+	movq	ARG9, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_16x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG10, %r10 // offsetD
+	movq	ARG11, %r11 // D
+	movq	ARG12, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG13, %r13 // m0
+	movq	ARG14, %r14 // m1
+	movq	ARG15, %r15 // n0
+	movq	ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_16x4_gen_lib8, .-kernel_sgemm_nt_16x4_gen_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi           rdx       rcx      r8           r9        rsp+8    rsp+16       rsp+24    rsp+32   rsp+40    rsp+48
+// void kernel_sgemm_nn_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_16x4_lib8
+	.type kernel_sgemm_nn_16x4_lib8, @function
+kernel_sgemm_nn_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_16x4_lib8
+_kernel_sgemm_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_16x4_lib8
+	.def kernel_sgemm_nn_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13  // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12   // C
+	movq	ARG10, %r13   // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_16x4_lib8, .-kernel_sgemm_nn_16x4_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5            6         7        8            9         10       11        12       13      14
+// void kernel_sgemm_nn_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_16x4_vs_lib8
+	.type kernel_sgemm_nn_16x4_vs_lib8, @function
+kernel_sgemm_nn_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_16x4_vs_lib8
+_kernel_sgemm_nn_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_16x4_vs_lib8
+	.def kernel_sgemm_nn_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13  // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12   // C
+	movq	ARG10, %r13   // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG13, %r12 // km
+	movq	ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_16x4_vs_lib8, .-kernel_sgemm_nn_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                    rdi    rsi           rdx       rcx      r8        r9        rsp+8    rsp+16       rsp+24    rsp+32    rsp+40   rsp+48    rsp+56    rsp+64   rsp+72  rsp+80  rsp+88  rsp+96
+// void kernel_sgemm_nn_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_16x4_gen_lib8
+	.type kernel_sgemm_nn_16x4_gen_lib8, @function
+kernel_sgemm_nn_16x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_16x4_gen_lib8
+_kernel_sgemm_nn_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_16x4_gen_lib8
+	.def kernel_sgemm_nn_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13  // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12 // offsetC
+	movq	ARG10, %r13 // C
+	movq	ARG11, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_16x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG12, %r10 // offsetD
+	movq	ARG13, %r11 // D
+	movq	ARG14, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG15, %r13 // m0
+	movq	ARG16, %r14 // m1
+	movq	ARG17, %r15 // n0
+	movq	ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_16x4_gen_lib8, .-kernel_sgemm_nn_16x4_gen_lib8
+#endif
+
+
+
+
+
+//                                  1      2             3         4        5         6            7         8        9         10
+// void kernel_ssyrk_nt_l_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
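+//
+// Reference model in C (illustrative sketch only, not used by the build): same
+// tile and storage convention as kernel_sgemm_nt_16x4_lib8 above, but only the
+// part of the 16x4 block on or below the diagonal is written, which is what the
+// inner_store_l_* routines implement with their vblendps masks.
+//
+// static void kernel_ssyrk_nt_l_16x4_ref(int k, float alpha, const float *A, int sda,
+//                                        const float *B, float beta,
+//                                        const float *C, int sdc, float *D, int sdd)
+//     {
+//     for(int j=0; j<4; j++)
+//         for(int i=j; i<16; i++)        // i>=j: keep D above the diagonal untouched
+//             {
+//             float acc = 0.0f;
+//             for(int l=0; l<k; l++)
+//                 acc += A[(i/8)*8*sda + i%8 + 8*l] * B[j + 8*l];
+//             D[(i/8)*8*sdd + i%8 + 8*j] = alpha*acc
+//                 + beta*C[(i/8)*8*sdc + i%8 + 8*j];
+//             }
+//     }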
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_16x4_lib8
+	.type kernel_ssyrk_nt_l_16x4_lib8, @function
+kernel_ssyrk_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_16x4_lib8
+_kernel_ssyrk_nt_l_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	%rsi, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_16x4_lib8, .-kernel_ssyrk_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5         6            7         8        9         10       11      12
+// void kernel_ssyrk_nt_l_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
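+// Same computation as the fixed-size variant above; the extra km/kn arguments
+// are only forwarded to inner_store_l_16x4_vs_lib8 (as m1/n1 in the comments
+// below), so they appear to bound how many rows/columns of the 16x4 block are
+// actually written back.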
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_16x4_vs_lib8
+	.type kernel_ssyrk_nt_l_16x4_vs_lib8, @function
+kernel_ssyrk_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_16x4_vs_lib8
+_kernel_ssyrk_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_16x4_vs_lib8
+	.def kernel_ssyrk_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_16x4_vs_lib8, .-kernel_ssyrk_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                  1      2             3         4        5         6            7         8        9         10
+// void kernel_ssyrk_nt_l_12x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_12x4_lib8
+	.type kernel_ssyrk_nt_l_12x4_lib8, @function
+kernel_ssyrk_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_12x4_lib8
+_kernel_ssyrk_nt_l_12x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_12x4_lib8, .-kernel_ssyrk_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5         6            7         8        9         10       11      12
+// void kernel_ssyrk_nt_l_12x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_12x4_vs_lib8
+	.type kernel_ssyrk_nt_l_12x4_vs_lib8, @function
+kernel_ssyrk_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_12x4_vs_lib8
+_kernel_ssyrk_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_12x4_vs_lib8
+	.def kernel_ssyrk_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_12x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_12x4_vs_lib8, .-kernel_ssyrk_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+//                                       rdi    rsi       rdx      rcx       r8        r9       rsp+8     rsp+16   rsp+24    rsp+32 
+// void kernel_strsm_nt_rl_inv_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+
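+// Call sequence below: the NT product A*B' is subtracted from the zeroed
+// accumulators (inner_kernel_gemm_sub_nt_16x4_lib8), C is added with unit
+// coefficients (inner_scale_11_16x4_lib8), the triangular-solve edge routine is
+// applied with E and the reciprocal diagonal entries passed in inv_diag_E (as
+// the name suggests), and the full 16x4 block is stored to D.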
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_16x4_lib8
+	.type kernel_strsm_nt_rl_inv_16x4_lib8, @function
+kernel_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_16x4_lib8
+_kernel_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_16x4_lib8
+	.def kernel_strsm_nt_rl_inv_16x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movl	$4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_16x4_lib8, .-kernel_strsm_nt_rl_inv_16x4_lib8
+#endif
+
+
+
+
+
+//                                          rdi    rsi       rdx      rcx       r8        r9       rsp+8     rsp+16   rsp+24    rsp+32             rsp+40  rsp+48
+// void kernel_strsm_nt_rl_inv_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_16x4_vs_lib8
+	.type kernel_strsm_nt_rl_inv_16x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_16x4_vs_lib8
+_kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_16x4_vs_lib8
+	.def kernel_strsm_nt_rl_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG11, %r12 // m1 
+	movq	ARG12, %r13 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_16x4_vs_lib8, .-kernel_strsm_nt_rl_inv_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                             1       2          3         4          5       6          7         8          9         10       11        12       13        14
+// void kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+
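+// Fused gemm + trsm: the kernel accumulates Ap*Bp' over kp (add variant), then
+// subtracts Am*Bm' over km (sub variant), adds C, applies the same trsm edge
+// step as above with E/inv_diag_E, and stores the 16x4 block to D. A rough
+// C-level picture of the update, assuming that reading (illustration only,
+// trsm_rlt is not a function of this library):
+//   D = trsm_rlt(C + Ap*Bp' - Am*Bm', E, inv_diag_E);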
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_16x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10  // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG13, %r10  // E 
+	movq	ARG14, %r11  // inv_diag_E 
+	movl	$4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_16x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+#endif
+
+
+
+
+
+//                                                1       2          3         4          5       6          7         8          9         10       11        12       13        14                 15      16
+// void kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10  // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG13, %r10  // E 
+	movq	ARG14, %r11  // inv_diag_E 
+	movq	ARG16, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG15, %r12 // km 
+	movq	ARG16, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                   1      2         3        4         5         6        7         8        9
+// void kernel_spotrf_nt_l_12x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
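+// Cholesky edge kernel: the NT product is subtracted from the accumulators, C
+// is added, and inner_edge_potrf_12x4_vs_lib8 performs the factorization step;
+// judging by its name and by being the only consumer of ARG9 below, inv_diag_D
+// receives the reciprocals of the factor's diagonal. The lower 12x4 block is
+// then stored to D.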
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_12x4_lib8
+	.type kernel_spotrf_nt_l_12x4_lib8, @function
+kernel_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_12x4_lib8
+_kernel_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_12x4_lib8
+	.def kernel_spotrf_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_12x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_12x4_lib8, .-kernel_spotrf_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+//                                      1      2         3        4         5         6        7         8        9                  10      11
+// void kernel_spotrf_nt_l_12x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_12x4_vs_lib8
+	.type kernel_spotrf_nt_l_12x4_vs_lib8, @function
+kernel_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_12x4_vs_lib8
+_kernel_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_12x4_vs_lib8
+	.def kernel_spotrf_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_12x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG10, %r12 // m1 
+	movq	ARG11, %r13 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_12x4_vs_lib8, .-kernel_spotrf_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+//                                   1      2         3        4         5         6        7         8        9
+// void kernel_spotrf_nt_l_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_16x4_lib8
+	.type kernel_spotrf_nt_l_16x4_lib8, @function
+kernel_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_16x4_lib8
+_kernel_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_16x4_lib8
+	.def kernel_spotrf_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_16x4_lib8, .-kernel_spotrf_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+//                                      1      2         3        4         5         6        7         8        9                  10      11
+// void kernel_spotrf_nt_l_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_16x4_vs_lib8
+	.type kernel_spotrf_nt_l_16x4_vs_lib8, @function
+kernel_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_16x4_vs_lib8
+_kernel_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_16x4_vs_lib8
+	.def kernel_spotrf_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG10, %r12 // m1 
+	movq	ARG11, %r13 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_16x4_vs_lib8, .-kernel_spotrf_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                        1        2          3         4          5       6          7         8          9         10       11        12       13
+// void kernel_ssyrk_spotrf_nt_l_12x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
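+// Fused syrk + potrf: same structure as the gemm+trsm kernels above, except the
+// final edge step is the Cholesky factorization (inner_edge_potrf_12x4_vs_lib8)
+// rather than a triangular solve, and only the lower 12x4 part is stored.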
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_12x4_lib8
+	.type kernel_ssyrk_spotrf_nt_l_12x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_12x4_lib8
+_kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_12x4_lib8
+	.def kernel_ssyrk_spotrf_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movl	$4, %r11d
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_12x4_lib8, .-kernel_ssyrk_spotrf_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+//                                            1        2          3         4          5       6          7         8          9         10       11        12       13                14      15
+// void kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+	.type kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+	.def kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movq	ARG15, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG14, %r12 // km 
+	movq	ARG15, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+//                                        1        2          3         4          5       6          7         8          9         10       11        12       13
+// void kernel_ssyrk_spotrf_nt_l_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_16x4_lib8
+	.type kernel_ssyrk_spotrf_nt_l_16x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_16x4_lib8
+_kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_16x4_lib8
+	.def kernel_ssyrk_spotrf_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movl	$4, %r11d
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_16x4_lib8, .-kernel_ssyrk_spotrf_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+//                                            1        2          3         4          5       6          7         8          9         10       11        12       13                14      15
+// void kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+	.type kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+	.def kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movq	ARG15, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG14, %r12 // km 
+	movq	ARG15, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5            6         7        8         9
+// void kernel_strmm_nn_rl_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd);
+
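+// Right-lower triangular multiply: inner_edge_trmm_nn_rl_16x4_lib8 handles the
+// leading triangle of B (shifted by offsetB), the generic NN inner kernel
+// finishes the remaining columns, and the result is scaled by alpha alone
+// (inner_scale_a0, no C/beta term in the prototype) before the 16x4 store.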
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_16x4_lib8
+	.type kernel_strmm_nn_rl_16x4_lib8, @function
+kernel_strmm_nn_rl_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_16x4_lib8
+_kernel_strmm_nn_rl_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_16x4_lib8
+	.def kernel_strmm_nn_rl_16x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_16x4_lib8, .-kernel_strmm_nn_rl_16x4_lib8
+#endif
+
+
+
+
+
+//                                      1      2             3         4        5            6         7        8         9        10      11
+// void kernel_strmm_nn_rl_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_16x4_vs_lib8
+	.type kernel_strmm_nn_rl_16x4_vs_lib8, @function
+kernel_strmm_nn_rl_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_16x4_vs_lib8
+_kernel_strmm_nn_rl_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_16x4_vs_lib8
+	.def kernel_strmm_nn_rl_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG10, %r12 // km
+	movq	ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_16x4_vs_lib8, .-kernel_strmm_nn_rl_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                       1      2             3         4        5            6         7        8            9         10       11      12      13      14
+// void kernel_strmm_nn_rl_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_16x4_gen_lib8
+	.type kernel_strmm_nn_rl_16x4_gen_lib8, @function
+kernel_strmm_nn_rl_16x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_16x4_gen_lib8
+_kernel_strmm_nn_rl_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_16x4_gen_lib8
+	.def kernel_strmm_nn_rl_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // offsetD
+	movq	ARG9, %r11 // D
+	movq	ARG10, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG11, %r13 // m0
+	movq	ARG12, %r14 // m1
+	movq	ARG13, %r15 // n0
+	movq	ARG14, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_16x4_gen_lib8, .-kernel_strmm_nn_rl_16x4_gen_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
+
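+// The .long values above are IEEE-754 single-precision bit patterns, stored
+// element 0 first while the braces in the comments list element 7 first:
+// 1056964608 == 0x3F000000 == 0.5f, 1065353216 == 0x3F800000 == 1.0f,
+// 3212836864 == 0xBF800000 == -1.0f. A quick C check (illustration only,
+// needs <stdint.h> and <string.h>):
+//   uint32_t u = 1056964608u; float f; memcpy(&f, &u, sizeof f); /* f == 0.5f */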
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_8x4_lib8.S b/kernel/avx/kernel_sgemm_8x4_lib8.S
new file mode 100644
index 0000000..d319a83
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_8x4_lib8.S
@@ -0,0 +1,6673 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
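+// Note on the argument macros above: on the SysV side the return address sits
+// at (%rsp) on entry and the 7th and later integer arguments start at 8(%rsp),
+// so after the prologue's "subq $STACKSIZE, %rsp" they are reached at
+// STACKSIZE+8(%rsp), STACKSIZE+16(%rsp), and so on. On Windows x64 the first
+// stack argument is the 5th one and lives above the 32-byte shadow space plus
+// the return address, hence STACKSIZE+40(%rsp); xmm6-xmm15 are callee-saved
+// there, which is why the Windows PROLOGUE/EPILOGUE spill and restore them.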
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
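+//
+// In C-like terms (a reference sketch, index names illustrative), with A and B
+// stored in lib8 panels of 8 contiguous floats per column, the routine
+// accumulates roughly:
+//
+//     for(ll=0; ll<k; ll++)
+//         for(jj=0; jj<4; jj++)
+//             for(ii=0; ii<8; ii++)
+//                 d[jj][ii] += A[8*ll+ii] * B[8*ll+jj];
+//
+// i.e. an 8x4 block of C += A * B^T. Because of the broadcast+shuffle scheme
+// used below, the four accumulators physically hold these columns in the
+// permuted element order documented for INNER_BLEND_SCALE_AB_8X4_LIB8 further
+// down, which restores the natural layout before scaling and storing.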
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nt_8x4_lib8, @function
+inner_kernel_gemm_add_nt_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nt_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_8x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vbroadcastf128	32(%r12), %ymm15 // B
+	vmovaps			32(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	96(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	128(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	32(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+	vmovaps			32(%r11), %ymm13 // A
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	96(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+//	vbroadcastf128	128(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+//	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm15, %ymm11
+//	vbroadcastf128	32(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+//	vmovaps			32(%r11), %ymm13 // A
+
+
+//	cmpl	$4, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$32, %r12
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nt_8x4_lib8, .-inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
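+//
+// Same loads, shuffles and register usage as INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+// above; the only difference is that vsubps replaces vaddps, so the 8x4 block
+// is updated as C -= A * B^T.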
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_sub_nt_8x4_lib8, @function
+inner_kernel_gemm_sub_nt_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_sub_nt_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_8x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vbroadcastf128	32(%r12), %ymm15 // B
+	vmovaps			32(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	96(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	128(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	32(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+	vmovaps			32(%r11), %ymm13 // A
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	96(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+//	vbroadcastf128	128(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+//	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm15, %ymm11
+//	vbroadcastf128	32(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+//	vmovaps			32(%r11), %ymm13 // A
+
+
+//	cmpl	$4, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$32, %r12
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_sub_nt_8x4_lib8, .-inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
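+//
+// In C-like terms (a reference sketch, index names illustrative), with B
+// stored in lib8 panels of 8 rows and a panel stride of 8*sdb floats, the
+// routine accumulates roughly:
+//
+//     for(ll=0; ll<k; ll++)
+//         for(jj=0; jj<4; jj++)
+//             for(ii=0; ii<8; ii++)
+//                 d[jj][ii] += A[8*ll+ii] * B[(ll/8)*8*sdb + 8*jj + ll%8];
+//
+// i.e. an 8x4 block of C += A * B, walking B one row at a time inside a panel
+// and jumping to the next panel (software-prefetched through r14) every 8
+// iterations.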
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nn_8x4_lib8, @function
+inner_kernel_gemm_add_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_8x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r12, %r14 // B_next <- B
+	addq	%r13, %r14 // B_next <- B + 8*sdb*sizeof(float)
+
+	cmpl	$8, %r10d
+	jl		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	0(%r14) // software prefetch
+	prefetcht0	64(%r14) // software prefetch
+
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 1
+	vmovaps			32(%r11), %ymm12 // A[0]
+	vbroadcastss	4(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	68(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	100(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 2
+	vmovaps			64(%r11), %ymm12 // A[0]
+	vbroadcastss	8(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	40(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	72(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	104(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 3
+	vmovaps			96(%r11), %ymm12 // A[0]
+	vbroadcastss	12(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	44(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	76(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	108(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 4
+	vmovaps			128(%r11), %ymm12 // A[0]
+	vbroadcastss	16(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	48(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	80(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	112(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 5
+	vmovaps			160(%r11), %ymm12 // A[0]
+	vbroadcastss	20(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	52(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	84(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	116(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 6
+	vmovaps			192(%r11), %ymm12 // A[0]
+	vbroadcastss	24(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	56(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	88(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	120(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 7
+	vmovaps			224(%r11), %ymm12 // A[0]
+	vbroadcastss	28(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	60(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	92(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	124(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+	subl	$8, %r10d
+	addq	$256, %r11
+
+	mov		%r14, %r12
+	addq	%r13, %r14
+
+	cmpl	$7, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean1-up loop
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean1-up loop
+	
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$4, %r12
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nn_8x4_lib8, .-inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
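+//
+// Same access pattern as INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8 above, with the
+// accumulation done by vsubps instead of vaddps (C -= A * B).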
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_SUB_NN_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_sub_nn_8x4_lib8, @function
+inner_kernel_gemm_sub_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_sub_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nn_8x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r12, %r14 // B_next <- B
+	addq	%r13, %r14 // B_next <- B + 8*sdb*sizeof(float)
+
+	cmpl	$8, %r10d
+	jl		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	0(%r14) // software prefetch
+	prefetcht0	64(%r14) // software prefetch
+
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 1
+	vmovaps			32(%r11), %ymm12 // A[0]
+	vbroadcastss	4(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	68(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	100(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 2
+	vmovaps			64(%r11), %ymm12 // A[0]
+	vbroadcastss	8(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	40(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	72(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	104(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 3
+	vmovaps			96(%r11), %ymm12 // A[0]
+	vbroadcastss	12(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	44(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	76(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	108(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 4
+	vmovaps			128(%r11), %ymm12 // A[0]
+	vbroadcastss	16(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	48(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	80(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	112(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 5
+	vmovaps			160(%r11), %ymm12 // A[0]
+	vbroadcastss	20(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	52(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	84(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	116(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 6
+	vmovaps			192(%r11), %ymm12 // A[0]
+	vbroadcastss	24(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	56(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	88(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	120(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+
+	// unroll 7
+	vmovaps			224(%r11), %ymm12 // A[0]
+	vbroadcastss	28(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	60(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	92(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	124(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+	subl	$8, %r10d
+	addq	$256, %r11
+
+	mov		%r14, %r12
+	addq	%r13, %r14
+
+	cmpl	$7, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean1-up loop
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean1-up loop
+	
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vsubps			%ymm15, %ymm3, %ymm3
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$4, %r12
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_sub_nn_8x4_lib8, .-inner_kernel_gemm_sub_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(float)
+// r14   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(8-offB)
+// r11   <- A+(8-offB)*bs*sizeof(float)
+// r12   <- B-offB*sizeof(float)+bs*sdb*sizeof(float)
+// r13   <- bs*sdb*sizeof(float)
+// r14   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
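+//
+// When B does not start at the top of a panel (offB>0), this edge routine
+// consumes the first min(k, 8-offB) iterations one at a time, reading B from
+// row offB of its first panel, and then advances r12 to the start of the next
+// panel so that the aligned NN kernel above can take over.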
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_gemm_add_nn_8x4_lib8, @function
+inner_edge_gemm_add_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_gemm_add_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_8x4_lib8:
+#endif
+#endif
+	
+	cmpl			$0, %r14d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$8, %r15d
+	subl			%r14d, %r15d // 8-offsetB
+	cmpl			%r10d, %r15d
+//	jle				0f
+//	movl			%r10d, %r15d // kend=min(k,8-offsetB)
+//0:
+	cmovgl			%r10d, %r15d // kend=min(k,8-offsetB)
+
+	movl			%r14d, %eax
+	sall			$2, %eax // offsetB*sizeof(float)
+	addq			%rax, %r12 // B+offsetB*sizeof(float)
+
+1:
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	subl			$1, %r15d // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(float)
+	addq			$4, %r12 // B+1*sizeof(float)
+
+	cmpl			$0, %r15d
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r13, %r12
+	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_gemm_add_nn_8x4_lib8, .-inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(float)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(float)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
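+//
+// Edge for D = A * B with B lower triangular (right, lower, not transposed):
+// the first row of B contributes to column 0 only, the second to columns 0-1,
+// the third to columns 0-2, so these three rows are handled one at a time
+// before the dense NN kernel takes over; the branches below distinguish the
+// starting offset of B inside its 8-row panel, which decides when the B
+// pointer has to jump to the next panel.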
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trmm_nn_rl_8x4_lib8, @function
+inner_edge_trmm_nn_rl_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trmm_nn_rl_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_8x4_lib8:
+#endif
+#endif
+	
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	movl		%r14d, %eax
+	sall		$2, %eax // offsetB*sizeof(float)
+	movq		%r12, %rbx // B
+	addq		%rax, %rbx // B+offsetB*sizeof(float)
+
+
+	cmpl	$4, %r14d
+	jg		1f
+
+	// offB==0, 1, 2, 3, 4
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	8(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	40(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	72(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	jmp			0f // end
+
+
+1:
+	cmpl	$5, %r14d
+	jg		1f
+
+	// offB==5
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	8(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	40(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	72(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r13, %r12 // B+8*sdb*sizeof(float)
+	movl		$0, %r14d // offsetB=0
+
+	jmp			0f // end
+
+
+1:
+	cmpl	$6, %r14d
+	jg		1f
+
+	// offB==6
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r13, %r12 // B+8*sdb*sizeof(float)
+	movq		%r12, %rbx // B
+	movl		$0, %r14d // offsetB=0
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	jmp			0f // end
+
+
+1:
+//	cmpl	$7, %r14d
+//	jg		0f
+
+	// offB==7
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r13, %r12 // B+8*sdb*sizeof(float)
+	movq		%r12, %rbx // B
+	movl		$0, %r14d // offsetB=0
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	68(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+//	jmp			0f // end
+
+
+	// end
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trmm_nn_rl_8x4_lib8, .-inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
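+//
+// Triangular solve of the 8-row block against a 4x4 lower factor E (right,
+// lower, transposed, not unit), with the reciprocals of the diagonal already
+// precomputed in inv_diag_E. Column by column this is, roughly,
+//
+//     d_j = ( d_j - sum_{l<j} d_l * e_{j,l} ) * inv_diag_e[j]
+//
+// which the broadcast/multiply/subtract sequence below unrolls for j=0..3.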
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsm_rlt_inv_8x4_lib8, @function
+inner_edge_trsm_rlt_inv_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsm_rlt_inv_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x4_lib8:
+#endif
+#endif
+
+	vbroadcastss	0(%r11), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vbroadcastss	4(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm1, %ymm1
+	vbroadcastss	8(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vbroadcastss	12(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	4(%r11), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vbroadcastss	40(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vbroadcastss	44(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	8(%r11), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vbroadcastss	76(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	12(%r11), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsm_rlt_inv_8x4_lib8, .-inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsm_rlt_inv_8x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsm_rlt_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#endif
+#endif
+
+	vbroadcastss	0(%r11), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	cmpl			$2, %r12d
+	jl				0f // ret
+	vbroadcastss	4(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm1, %ymm1
+	vbroadcastss	8(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vbroadcastss	12(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	4(%r11), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	cmpl			$3, %r12d
+	jl				0f // ret
+	vbroadcastss	40(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vbroadcastss	44(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	8(%r11), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	cmpl			$4, %r12d
+	jl				0f // ret
+	vbroadcastss	76(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	12(%r11), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsm_rlt_inv_8x4_vs_lib8, .-inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization 
+//
+// input arguments:
+// r10  <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
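+//
+// Cholesky factorization of the 4x4 diagonal block together with the 4 rows
+// below it: for each column j the pivot d_jj is compared against 0.0; if
+// positive, 1.0/sqrt(d_jj) is stored into inv_diag_E[j] and used to scale the
+// column, whose outer-product contribution is then subtracted from the
+// remaining columns; if not positive, the column is zeroed instead (labels
+// 1/3/5/7 below), so no NaNs are produced.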
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_8x4_lib8, @function
+inner_edge_potrf_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x4_lib8:
+#endif
+#endif
+
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vmovss		%xmm0, %xmm0, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+2:
+	vmovss		%xmm13, 0(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm0
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm1, %ymm1
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0x55, %xmm1, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+4:
+	vmovss		%xmm13, 4(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm1
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm11
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0xaa, %xmm2, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+6:
+	vmovss		%xmm13, 8(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm2
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm11
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0xff, %xmm3, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+8:
+	vmovss		%xmm13, 12(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm3
+
+	jmp		0f
+
+
+1:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_8x4_lib8, .-inner_edge_potrf_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_8x4_vs_lib8, @function
+inner_edge_potrf_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x4_vs_lib8:
+#endif
+#endif
+
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vmovss		%xmm0, %xmm0, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+2:
+	vmovss		%xmm13, 0(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm0
+	cmpl		$2, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm1, %ymm1
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0x55, %xmm1, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+4:
+	vmovss		%xmm13, 4(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm1
+	cmpl		$3, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm11
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0xaa, %xmm2, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+6:
+	vmovss		%xmm13, 8(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm2
+	cmpl		$4, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm11
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0xff, %xmm3, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+8:
+	vmovss		%xmm13, 12(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm3
+
+	jmp		0f
+
+
+1:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_8x4_vs_lib8, .-inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
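+//
+// Applies D = alpha*acc + beta*C to the four 8-wide accumulator columns; when
+// beta compares equal to 0.0 the loads of C are skipped entirely, so C is not
+// accessed in that case.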
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x4_lib8, @function
+inner_scale_ab_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovaps		0(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x4_lib8, .-inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
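+//
+// Scales the accumulators by alpha and, at the same time, un-shuffles and
+// transposes them from the 8x4 layout produced by the NT kernel into a 4x8
+// block (xmm0-xmm3 hold columns 0-3, xmm4-xmm7 columns 4-7 after the
+// vextractf128), then optionally adds beta*C column by column.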
+
+#if MACRO_LEVEL>=1
+	.macro INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_tran_scale_ab_4x8_lib8, @function
+inner_tran_scale_ab_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_tran_scale_ab_4x8_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm4
+	vmulps		%ymm1, %ymm15, %ymm5
+	vmulps		%ymm2, %ymm15, %ymm6
+	vmulps		%ymm3, %ymm15, %ymm7
+
+	// transpose
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm0
+	vblendps	$0xaa, %ymm4, %ymm5, %ymm1
+	vblendps	$0xaa, %ymm6, %ymm7, %ymm2
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm3
+
+	vunpcklps	%ymm1, %ymm0, %ymm4
+	vunpckhps	%ymm1, %ymm0, %ymm5
+	vunpcklps	%ymm3, %ymm2, %ymm6
+	vunpckhps	%ymm3, %ymm2, %ymm7
+
+	vunpcklpd	%ymm5, %ymm7, %ymm2
+	vunpckhpd	%ymm5, %ymm7, %ymm3
+	vunpcklpd	%ymm6, %ymm4, %ymm0
+	vunpckhpd	%ymm6, %ymm4, %ymm1
+
+	vextractf128 $0x1, %ymm0, %xmm4
+	vextractf128 $0x1, %ymm1, %xmm5
+	vextractf128 $0x1, %ymm2, %xmm6
+	vextractf128 $0x1, %ymm3, %xmm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovaps		0(%r12), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm0, %xmm0
+	vmovaps		32(%r12), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm1, %xmm1
+	vmovaps		64(%r12), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm2, %xmm2
+	vmovaps		96(%r12), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm3, %xmm3
+	vmovaps		128(%r12), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm4, %xmm4
+	vmovaps		160(%r12), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm5, %xmm5
+	vmovaps		192(%r12), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm6, %xmm6
+	vmovaps		224(%r12), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm7, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_tran_scale_ab_4x8_lib8, .-inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// r15  <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x4_gen_lib8, @function
+inner_scale_ab_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_gen_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+
+	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x4_gen_lib8, .-inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// r15  <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_TRAN_SCALE_AB_4X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_tran_scale_ab_4x8_gen_lib8, @function
+inner_tran_scale_ab_4x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_tran_scale_ab_4x8_gen_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_gen_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm4
+	vmulps		%ymm1, %ymm15, %ymm5
+	vmulps		%ymm2, %ymm15, %ymm6
+	vmulps		%ymm3, %ymm15, %ymm7
+
+	// transpose
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm0
+	vblendps	$0xaa, %ymm4, %ymm5, %ymm1
+	vblendps	$0xaa, %ymm6, %ymm7, %ymm2
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm3
+
+	vunpcklps	%ymm1, %ymm0, %ymm4
+	vunpckhps	%ymm1, %ymm0, %ymm5
+	vunpcklps	%ymm3, %ymm2, %ymm6
+	vunpckhps	%ymm3, %ymm2, %ymm7
+
+	vunpcklpd	%ymm5, %ymm7, %ymm2
+	vunpckhpd	%ymm5, %ymm7, %ymm3
+	vunpcklpd	%ymm6, %ymm4, %ymm0
+	vunpckhpd	%ymm6, %ymm4, %ymm1
+
+	vextractf128 $0x1, %ymm0, %xmm4
+	vextractf128 $0x1, %ymm1, %xmm5
+	vextractf128 $0x1, %ymm2, %xmm6
+	vextractf128 $0x1, %ymm3, %xmm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm0, %xmm0
+	vmovaps		32(%r13), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm1, %xmm1
+	vmovaps		64(%r13), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm2, %xmm2
+	vmovaps		96(%r13), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm3, %xmm3
+	vmovaps		128(%r13), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm4, %xmm4
+	vmovaps		160(%r13), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm5, %xmm5
+	vmovaps		192(%r13), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm6, %xmm6
+	vmovaps		224(%r13), %xmm15
+	vmulps		%xmm15, %xmm14, %xmm15
+	vaddps		%xmm15, %xmm7, %xmm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_tran_scale_ab_4x8_gen_lib8, .-inner_tran_scale_ab_4x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10   <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_A0_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_a0_8x4_lib8, @function
+inner_scale_a0_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_a0_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_8x4_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_a0_8x4_lib8, .-inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
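+// Sketch of the de-interleave: the NT microkernel leaves the 8x4 block with its
+// columns rotated within each 4-lane group, e.g. for the lower lanes
+//   ymm0 = [d00 d11 d22 d33], ymm1 = [d01 d10 d23 d32],
+//   ymm2 = [d03 d12 d21 d30], ymm3 = [d02 d13 d20 d31].
+// The two rounds of vblendps ($0xaa/$0x55, then $0xcc/$0x33) undo this rotation,
+// leaving column j of the block in ymm<j> before alpha and beta are applied.
+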
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x4_lib8, @function
+inner_blend_scale_ab_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm10
+	vblendps	$0x55, %ymm3, %ymm2, %ymm11
+
+	vblendps	$0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm10, %ymm9, %ymm1
+	vblendps	$0x33, %ymm10, %ymm9, %ymm3
+
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovaps		0(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x4_lib8, .-inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 8*sdc*sizeof(float)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x4_gen_lib8, @function
+inner_blend_scale_ab_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_gen_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm10
+	vblendps	$0x55, %ymm3, %ymm2, %ymm11
+
+	vblendps	$0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm10, %ymm9, %ymm1
+	vblendps	$0x33, %ymm10, %ymm9, %ymm3
+
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+
+	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x4_gen_lib8, .-inner_blend_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
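+// Sketch: same de-interleave as in inner_blend_scale_ab_8x4_lib8, but with
+// alpha=1.0 and beta=1.0 the scaling collapses to a plain accumulation,
+//
+//   for(int jj=0; jj<4; jj++)
+//     for(int ii=0; ii<8; ii++)
+//       acc[ii][jj] += C[ii+8*jj];
+//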
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_8x4_lib8, @function
+inner_blend_scale_11_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_8x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm10
+	vblendps	$0x55, %ymm3, %ymm2, %ymm11
+
+	vblendps	$0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm10, %ymm9, %ymm1
+	vblendps	$0x33, %ymm10, %ymm9, %ymm3
+
+	vmovaps		0(%r10), %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r10), %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r10), %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r10), %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_8x4_lib8, .-inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_8X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_8x4_gen_lib8, @function
+inner_blend_scale_11_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_gen_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm10
+	vblendps	$0x55, %ymm3, %ymm2, %ymm11
+
+	vblendps	$0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm10, %ymm9, %ymm1
+	vblendps	$0x33, %ymm10, %ymm9, %ymm3
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r11), %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r11), %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r11), %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r11), %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %r15 // C0
+	addq	%r12, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_8x4_gen_lib8, .-inner_blend_scale_11_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_lib8, @function
+inner_store_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_lib8:
+#endif
+#endif
+	
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps 	%ymm3, 96(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_lib8, .-inner_store_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x8_lib8, @function
+inner_store_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x8_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_lib8:
+#endif
+#endif
+	
+	vmovaps 	%xmm0,  0(%r10)
+	vmovaps 	%xmm1, 32(%r10)
+	vmovaps 	%xmm2, 64(%r10)
+	vmovaps 	%xmm3, 96(%r10)
+	vmovaps 	%xmm4, 128(%r10)
+	vmovaps 	%xmm5, 160(%r10)
+	vmovaps 	%xmm6, 192(%r10)
+	vmovaps 	%xmm7, 224(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_lib8, .-inner_store_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
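+// Sketch of the masked store, assuming .LC00 holds the ascending per-lane index
+// ramp used for comparisons: km is converted to float, broadcast, and subtracted
+// from the ramp so that lane ii has its sign bit set exactly when ii < km, which
+// vmaskmovps then uses to enable the store:
+//
+//   for(int jj=0; jj<kn && jj<4; jj++)
+//     for(int ii=0; ii<8; ii++)
+//       if(ii < km) D[ii+8*jj] = acc[ii][jj];
+//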
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_vs_lib8, @function
+inner_store_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%ymm14, %ymm12, %ymm14
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm14,  0(%r10)
+	cmpl		$2, %r12d
+	jl			0f // end
+	vmaskmovps	%ymm1, %ymm14, 32(%r10)
+	cmpl		$3, %r12d
+	jl			0f // end
+	vmaskmovps	%ymm2, %ymm14, 64(%r10)
+	je			0f // end
+	vmaskmovps	%ymm3, %ymm14, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_vs_lib8, .-inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x8_vs_lib8, @function
+inner_store_4x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %xmm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%xmm14, %xmm12, %xmm14
+
+	// offset==0
+	vmaskmovps	%xmm0, %xmm14,  0(%r10)
+	cmpl		$2, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm1, %xmm14, 32(%r10)
+	cmpl		$3, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm2, %xmm14, 64(%r10)
+	cmpl		$4, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm3, %xmm14, 96(%r10)
+	cmpl		$5, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm4, %xmm14, 128(%r10)
+	cmpl		$6, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm5, %xmm14, 160(%r10)
+	cmpl		$7, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm6, %xmm14, 192(%r10)
+	je			0f // end
+	vmaskmovps	%xmm7, %xmm14, 224(%r10)
+	//
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_vs_lib8, .-inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
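+// Sketch of the generalized store (offset==0 path; the offset>0 branches below
+// are still TODO): the row mask enables lanes with m0 <= ii < m1, and the
+// accumulator columns and D are first shifted by n0, so that
+//
+//   for(int jj=n0; jj<n1 && jj<4; jj++)
+//     for(int ii=0; ii<8; ii++)
+//       if(m0 <= ii && ii < m1) D[ii+8*jj] = acc[ii][jj];
+//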
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_gen_lib8, @function
+inner_store_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm12, %ymm15
+	vandps		%ymm14, %ymm15, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm15,  0(%r11)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmaskmovps	%ymm1, %ymm15, 32(%r11)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmaskmovps	%ymm2, %ymm15, 64(%r11)
+	je			7f // end
+	vmaskmovps	%ymm3, %ymm15, 96(%r11)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_gen_lib8, .-inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x8_gen_lib8, @function
+inner_store_4x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %xmm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%xmm12, %xmm14, %xmm14
+	vsubps		%xmm15, %xmm12, %xmm15
+	vandps		%xmm14, %xmm15, %xmm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	vmovaps		%xmm3, %xmm2
+	vmovaps		%xmm4, %xmm3
+	vmovaps		%xmm5, %xmm4
+	vmovaps		%xmm6, %xmm5
+	vmovaps		%xmm7, %xmm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	vmovaps		%xmm3, %xmm2
+	vmovaps		%xmm4, %xmm3
+	vmovaps		%xmm5, %xmm4
+	vmovaps		%xmm6, %xmm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	vmovaps		%xmm3, %xmm2
+	vmovaps		%xmm4, %xmm3
+	vmovaps		%xmm5, %xmm4
+	addq		$32, %r11
+
+	cmpl	$3, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	vmovaps		%xmm3, %xmm2
+	vmovaps		%xmm4, %xmm3
+	addq		$32, %r11
+
+	cmpl	$4, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	vmovaps		%xmm3, %xmm2
+	addq		$32, %r11
+
+	cmpl	$5, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	addq		$32, %r11
+
+	cmpl	$6, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$8, %eax
+	jle		0f
+	movl	$8, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%xmm0, %xmm15,  0(%r11)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm1, %xmm15, 32(%r11)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm2, %xmm15, 64(%r11)
+	cmpl		$4, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm3, %xmm15, 96(%r11)
+	cmpl		$5, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm4, %xmm15, 128(%r11)
+	cmpl		$6, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm5, %xmm15, 160(%r11)
+	cmpl		$7, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm6, %xmm15, 192(%r11)
+	je			7f // end
+	vmaskmovps	%xmm7, %xmm15, 224(%r11)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_gen_lib8, .-inner_store_4x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
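+// Sketch of the lower store: for columns 1..3 the strictly-upper entries are
+// blended back in from the existing D, so only the lower-triangular part of the
+// 8x4 block (diagonal included) is overwritten:
+//
+//   for(int jj=0; jj<4; jj++)
+//     for(int ii=jj; ii<8; ii++)
+//       D[ii+8*jj] = acc[ii][jj];
+//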
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_lib8, @function
+inner_store_l_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_lib8:
+#endif
+#endif
+	
+	vmovaps 	32(%r10), %ymm12
+	vmovaps 	64(%r10), %ymm13
+	vmovaps 	96(%r10), %ymm14
+
+	vblendps	$0x1, %ymm12, %ymm1, %ymm1
+	vblendps	$0x3, %ymm13, %ymm2, %ymm2
+	vblendps	$0x7, %ymm14, %ymm3, %ymm3
+
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps 	%ymm3, 96(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_lib8, .-inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_vs_lib8, @function
+inner_store_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	vmaskmovps	%ymm0, %ymm15,  0(%r10)
+	cmpl		$2, %r12d
+	jl			0f // end
+	vmovaps 	32(%r10), %ymm12
+	vblendps	$0x1, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm15, 32(%r10)
+	cmpl		$3, %r12d
+	jl			0f // end
+	vmovaps 	64(%r10), %ymm12
+	vblendps	$0x3, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm15, 64(%r10)
+	je			0f // end
+	vmovaps 	96(%r10), %ymm12
+	vblendps	$0x7, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm15, 96(%r10)
+	//
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_vs_lib8, .-inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_gen_lib8, @function
+inner_store_l_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm12, %ymm15
+	vandps		%ymm14, %ymm15, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm15,  0(%r11)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmovaps 	32(%r11), %ymm12
+	vblendps	$0x1, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm15, 32(%r11)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmovaps 	64(%r11), %ymm12
+	vblendps	$0x3, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm15, 64(%r11)
+	je			7f // end
+	vmovaps 	96(%r11), %ymm12
+	vblendps	$0x7, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm15, 96(%r11)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_gen_lib8, .-inner_store_l_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                               rdi    rsi           rdx       rcx       r8           r9        rsp+8
+// void kernel_sgemm_nt_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
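+// Reference sketch of the kernel semantics, assuming the lib8 panel-major layout
+// (8-row panels, column stride of 8 floats) for A, B, C and D:
+//
+//   for(int jj=0; jj<4; jj++)
+//     for(int ii=0; ii<8; ii++) {
+//       float acc = 0.0f;
+//       for(int ll=0; ll<k; ll++)
+//         acc += A[ii+8*ll] * B[jj+8*ll];
+//       D[ii+8*jj] = alpha[0]*acc + beta[0]*C[ii+8*jj];
+//     }
+//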
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x4_lib8
+	.type kernel_sgemm_nt_8x4_lib8, @function
+kernel_sgemm_nt_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x4_lib8
+_kernel_sgemm_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x4_lib8
+	.def kernel_sgemm_nt_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x4_lib8, .-kernel_sgemm_nt_8x4_lib8
+#endif
+
+
+
+
+
+//                               rdi    rsi           rdx       rcx       r8           r9        rsp+8
+// void kernel_sgemm_nt_4x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
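+// Sketch: this kernel reuses the 8x4 NT microkernel with A and B swapped, so the
+// accumulators hold B*A^T = (A*B^T)^T as an 8x4 block; inner_tran_scale_ab_4x8
+// then transposes it back while applying alpha and beta, i.e.
+//
+//   D(4x8) = alpha * A(4xk) * B(8xk)^T + beta * C(4x8)
+//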
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_4x8_lib8
+	.type kernel_sgemm_nt_4x8_lib8, @function
+kernel_sgemm_nt_4x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_4x8_lib8
+_kernel_sgemm_nt_4x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_4x8_lib8
+	.def kernel_sgemm_nt_4x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // B
+	movq	ARG3, %r12  // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_4x8_lib8, .-kernel_sgemm_nt_4x8_lib8
+#endif
+
+
+
+
+
+//                               rdi    rsi           rdx       rcx       r8           r9        rsp+8     rsp+16   rsp+24
+// void kernel_sgemm_nt_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x4_vs_lib8
+	.type kernel_sgemm_nt_8x4_vs_lib8, @function
+kernel_sgemm_nt_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x4_vs_lib8
+_kernel_sgemm_nt_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x4_vs_lib8
+	.def kernel_sgemm_nt_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x4_vs_lib8, .-kernel_sgemm_nt_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi           rdx       rcx       r8           r9        rsp+8     rsp+16   rsp+24
+// void kernel_sgemm_nt_4x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_4x8_vs_lib8
+	.type kernel_sgemm_nt_4x8_vs_lib8, @function
+kernel_sgemm_nt_4x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_4x8_vs_lib8
+_kernel_sgemm_nt_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_4x8_vs_lib8
+	.def kernel_sgemm_nt_4x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // B
+	movq	ARG3, %r12  // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_4x8_vs_lib8, .-kernel_sgemm_nt_4x8_vs_lib8
+#endif
+
+
+
+
+
+//                                   rdi    rsi           rdx       rcx       r8           r9           rsp+8     rsp+16   rsp+24       rsp+32    rsp+40   rsp+48  rsp+56  rsp+64  rsp+72
+// void kernel_sgemm_nt_8x4_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
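+// Sketch of the _gen variant: C and D are addressed through (offsetC, sdc) and
+// (offsetD, sdd), so the block may start at an arbitrary row offset inside an
+// 8-row panel, and only entries with m0 <= row < m1 and n0 <= col < n1 are
+// written back; the offset>0 paths of the inner routines are still TODO.
+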
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x4_gen_lib8
+	.type kernel_sgemm_nt_8x4_gen_lib8, @function
+kernel_sgemm_nt_8x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x4_gen_lib8
+_kernel_sgemm_nt_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x4_gen_lib8
+	.def kernel_sgemm_nt_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // offsetC
+	movq	ARG7, %r13 // C
+	movq	ARG8, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG9, %r10 // offsetD
+	movq	ARG10, %r11 // D
+	movq	ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG12, %r13 // m0
+	movq	ARG13, %r14 // m1
+	movq	ARG14, %r15 // n0
+	movq	ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x4_gen_lib8, .-kernel_sgemm_nt_8x4_gen_lib8
+#endif
+
+
+
+
+
+//                                   rdi    rsi           rdx       rcx       r8           r9           rsp+8     rsp+16   rsp+24       rsp+32    rsp+40   rsp+48  rsp+56  rsp+64  rsp+72
+// void kernel_sgemm_nt_4x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_4x8_gen_lib8
+	.type kernel_sgemm_nt_4x8_gen_lib8, @function
+kernel_sgemm_nt_4x8_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_4x8_gen_lib8
+_kernel_sgemm_nt_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_4x8_gen_lib8
+	.def kernel_sgemm_nt_4x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // B
+	movq	ARG3, %r12  // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // offsetC
+	movq	ARG7, %r13 // C
+	movq	ARG8, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x8_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG9, %r10 // offsetD
+	movq	ARG10, %r11 // D
+	movq	ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG12, %r13 // m0
+	movq	ARG13, %r14 // m1
+	movq	ARG14, %r15 // n0
+	movq	ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_4x8_gen_lib8, .-kernel_sgemm_nt_4x8_gen_lib8
+#endif
+
+
+
+
+
+//                               rdi    rsi           rdx        rcx         r8         r9      rsp+8        rsp+16    rsp+24
+// void kernel_sgemm_nn_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
+
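+// Sketch of the NN variant, assuming the lib8 panel-major layout for B with
+// offsetB the starting row inside the first panel and sdb the panel stride:
+// element B(ll,jj) lives at
+//
+//   B[ (offsetB+ll)%8 + 8*jj + ((offsetB+ll)/8) * 8*sdb ]
+//
+// and the kernel computes D = alpha*A*B + beta*C on the 8x4 block, with the edge
+// routine consuming the initial partial panel of B before the main loop.
+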
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x4_lib8
+	.type kernel_sgemm_nn_8x4_lib8, @function
+kernel_sgemm_nn_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x4_lib8
+_kernel_sgemm_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x4_lib8
+	.def kernel_sgemm_nn_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x4_lib8, .-kernel_sgemm_nn_8x4_lib8
+#endif
+
+
+
+
+
+//                               1      2             3         4            5         6        7            8         9         10      11
+// void kernel_sgemm_nn_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x4_vs_lib8
+	.type kernel_sgemm_nn_8x4_vs_lib8, @function
+kernel_sgemm_nn_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x4_vs_lib8
+_kernel_sgemm_nn_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x4_vs_lib8
+	.def kernel_sgemm_nn_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // km
+	movq	ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x4_vs_lib8, .-kernel_sgemm_nn_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                   rdi    rsi           rdx       rcx       r8        r9       rsp+8        rsp+16    rsp+24    rsp+32    rsp+40   rsp+48     rsp+56   rsp+64  rsp+72  rsp+80  rsp+88
+// void kernel_sgemm_nn_8x4_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x4_gen_lib8
+	.type kernel_sgemm_nn_8x4_gen_lib8, @function
+kernel_sgemm_nn_8x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x4_gen_lib8
+_kernel_sgemm_nn_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x4_gen_lib8
+	.def kernel_sgemm_nn_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // offsetC
+	movq	ARG9, %r13 // C
+	movq	ARG10, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG11, %r10 // offsetD
+	movq	ARG12, %r11 // D
+	movq	ARG13, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG14, %r13 // m0
+	movq	ARG15, %r14 // m1
+	movq	ARG16, %r15 // n0
+	movq	ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x4_gen_lib8, .-kernel_sgemm_nn_8x4_gen_lib8
+#endif
+
+
+
+
+
+//                                 rdi    rsi           rdx       rcx       r8           r9        rsp+8
+// void kernel_ssyrk_nt_l_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
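+// Sketch: same accumulation as kernel_sgemm_nt_8x4_lib8 (D = alpha*A*B^T + beta*C
+// on the 8x4 block), but the final store_l keeps the existing strictly-upper
+// entries of D, so only elements with row >= col are updated, as needed for the
+// lower factor of a symmetric rank-k update.
+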
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_8x4_lib8
+	.type kernel_ssyrk_nt_l_8x4_lib8, @function
+kernel_ssyrk_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_8x4_lib8
+_kernel_ssyrk_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_8x4_lib8
+	.def kernel_ssyrk_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_8x4_lib8, .-kernel_ssyrk_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+//                                    1      2             3         4         5            6         7         8       9
+// void kernel_ssyrk_nt_l_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_8x4_vs_lib8
+	.type kernel_ssyrk_nt_l_8x4_vs_lib8, @function
+kernel_ssyrk_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_8x4_vs_lib8
+_kernel_ssyrk_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_8x4_vs_lib8
+	.def kernel_ssyrk_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_8x4_vs_lib8, .-kernel_ssyrk_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                      edi    rsi       rdx       ecx       r8        r9        rsp+8     
+// void kernel_strsm_nt_rl_inv_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+
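+// Sketch of the intended semantics, as suggested by the routine names, with E
+// lower triangular and inv_diag_E[jj] = 1.0/E[jj][jj]: form M = C - A*B^T, then
+// solve D * E^T = M one column at a time:
+//
+//   for(int jj=0; jj<4; jj++)
+//     for(int ii=0; ii<8; ii++) {
+//       float t = M[ii][jj];
+//       for(int ll=0; ll<jj; ll++)
+//         t -= D[ii][ll] * E[jj][ll];
+//       D[ii][jj] = t * inv_diag_E[jj];
+//     }
+//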
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_8x4_lib8
+	.type kernel_strsm_nt_rl_inv_8x4_lib8, @function
+kernel_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_8x4_lib8
+_kernel_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_8x4_lib8
+	.def kernel_strsm_nt_rl_inv_8x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_8x4_lib8, .-kernel_strsm_nt_rl_inv_8x4_lib8
+#endif
+
+
+
+
+
+//                                         edi    rsi       rdx       ecx       r8        r9        rsp+8               rsp+16  rsp+24  
+// void kernel_strsm_nt_rl_inv_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_8x4_vs_lib8
+	.type kernel_strsm_nt_rl_inv_8x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_8x4_vs_lib8
+_kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_8x4_vs_lib8
+	.def kernel_strsm_nt_rl_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn // TODO scale gen
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_8x4_vs_lib8, .-kernel_strsm_nt_rl_inv_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                            1       2          3          4       5          6          7         8         9         10
+// void kernel_sgemm_strsm_nt_rl_inv_8x4_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+
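+// Sketch: fused gemm + trsm; the right-hand side is assembled from two products,
+//
+//   M = C + Ap*Bp^T - Am*Bm^T
+//
+// and then the same lower/transposed right-side solve as in
+// kernel_strsm_nt_rl_inv_8x4_lib8 is applied using E and inv_diag_E.
+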
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_8x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10   // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_8x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+#endif
+
+
+
+
+
+//                                               1       2          3          4       5          6          7         8         9         10                 11      12
+// void kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
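+//
+// Hedged note (not from the original source): in the *_vs_* ("variable size")
+// variants the trailing km/kn arguments are assumed to clip the tile at the
+// matrix edges, i.e. only km rows and kn columns of the 8x4 block are solved
+// and stored; the remaining arguments keep the meaning of the fixed-size
+// kernel above.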
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10  // C 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D 
+	movq	ARG11, %r11 // km 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                  edi    rsi       rdx       rcx       r8        r9
+// void kernel_spotrf_nt_l_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
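+//
+// Hedged note (not from the original source): this kernel is assumed to
+// compute W = C - A*B^T on an 8x4 tile and then a Cholesky step on the
+// leading 4x4 block, writing the lower factor into D and the reciprocals of
+// its diagonal into inv_diag_D.  Scalar sketch, ignoring the panel layout and
+// the non-positive-pivot handling of the real edge routine:
+//
+//   for (int jj = 0; jj < 4; jj++) {
+//       float d = W[jj][jj];
+//       for (int kk = 0; kk < jj; kk++) d -= D[jj][kk] * D[jj][kk];
+//       D[jj][jj] = sqrtf(d);
+//       inv_diag_D[jj] = 1.0f / D[jj][jj];
+//       for (int ii = jj + 1; ii < 8; ii++) {
+//           d = W[ii][jj];
+//           for (int kk = 0; kk < jj; kk++) d -= D[ii][kk] * D[jj][kk];
+//           D[ii][jj] = d * inv_diag_D[jj];
+//       }
+//   }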
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_8x4_lib8
+	.type kernel_spotrf_nt_l_8x4_lib8, @function
+kernel_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_8x4_lib8
+_kernel_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_8x4_lib8
+	.def kernel_spotrf_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_8x4_lib8, .-kernel_spotrf_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+//                                     edi    rsi       rdx       rcx       r8        r9                  rsp+8   rsp+16
+// void kernel_spotrf_nt_l_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_8x4_vs_lib8
+	.type kernel_spotrf_nt_l_8x4_vs_lib8, @function
+kernel_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_8x4_vs_lib8
+_kernel_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_8x4_vs_lib8
+	.def kernel_spotrf_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movq	ARG8, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG7, %r11 // m1 
+	movq	ARG8, %r12 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_8x4_vs_lib8, .-kernel_spotrf_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                        1       2          3          4       5          6          7         8         9
+// void kernel_ssyrk_spotrf_nt_l_8x4_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
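+//
+// Hedged note (not from the original source): this fused kernel first
+// accumulates C + Ap*Bp^T - Am*Bm^T (the add and sub gemm calls below plus
+// the blend/scale step) and then runs the same factorization as
+// kernel_spotrf_nt_l_8x4_lib8; note that it reuses the variable-size potrf
+// edge with the column count pinned to 4 (the "movl $4, %r11d" before the
+// call).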
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_8x4_lib8
+	.type kernel_ssyrk_spotrf_nt_l_8x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_8x4_lib8
+_kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_8x4_lib8
+	.def kernel_ssyrk_spotrf_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$4, %r11d
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_8x4_lib8, .-kernel_ssyrk_spotrf_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+//                                           1       2          3          4       5          6          7         8         9                  10      11
+// void kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+	.type kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+	.def kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+	movq	ARG10, %r11 // km 
+	movq	ARG11, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                  1      2             3         4            5         6        7
+// void kernel_strmm_nn_rl_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D);
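+//
+// Hedged note (not from the original source): "nn_rl" places the triangular
+// operand B on the right (lower triangular, not transposed), so the kernel
+// below first handles the triangular leading edge of B together with any
+// panel misalignment given by offsetB (the two inner_edge_* calls), then
+// falls through to a plain nn gemm over the remaining k, scales the result
+// by alpha, and stores the 8x4 block into D.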
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_8x4_lib8
+	.type kernel_strmm_nn_rl_8x4_lib8, @function
+kernel_strmm_nn_rl_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_8x4_lib8
+_kernel_strmm_nn_rl_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_8x4_lib8
+	.def kernel_strmm_nn_rl_8x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG5, %r12 // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_8x4_lib8, .-kernel_strmm_nn_rl_8x4_lib8
+#endif
+
+
+
+
+
+//                                     1      2             3         4            5         6        7         8       9
+// void kernel_strmm_nn_rl_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_8x4_vs_lib8
+	.type kernel_strmm_nn_rl_8x4_vs_lib8, @function
+kernel_strmm_nn_rl_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_8x4_vs_lib8
+_kernel_strmm_nn_rl_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_8x4_vs_lib8
+	.def kernel_strmm_nn_rl_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG5, %r12 // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_8x4_vs_lib8, .-kernel_strmm_nn_rl_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                      1      2             3         4            5         6        7            8         9        10      11      12      13
+// void kernel_strmm_nn_rl_8x4_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
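+//
+// Hedged note (not from the original source): in the *_gen_* variant,
+// offsetD and sdd are assumed to place the 8x4 result at an arbitrary row
+// offset inside a panel-major output matrix, while m0/m1 and n0/n1 select
+// which rows and columns are actually written, so the same kernel can be
+// used for blocks that cross the matrix boundaries.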
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_8x4_gen_lib8
+	.type kernel_strmm_nn_rl_8x4_gen_lib8, @function
+kernel_strmm_nn_rl_8x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_8x4_gen_lib8
+_kernel_strmm_nn_rl_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_8x4_gen_lib8
+	.def kernel_strmm_nn_rl_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG5, %r12 // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // offsetD
+	movq	ARG8, %r11 // D
+	movq	ARG9, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG10, %r13 // m0
+	movq	ARG11, %r14 // m1
+	movq	ARG12, %r15 // n0
+	movq	ARG13, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_8x4_gen_lib8, .-kernel_strmm_nn_rl_8x4_gen_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_8x8_lib8.S b/kernel/avx/kernel_sgemm_8x8_lib8.S
new file mode 100644
index 0000000..354fa83
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_8x8_lib8.S
@@ -0,0 +1,5514 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
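+
+// Hedged note (not from the original source): the ARGn/PROLOGUE/EPILOGUE
+// macros above only adapt the kernels in this file to the two supported
+// calling conventions; C callers simply use the prototype given in the
+// comment before each kernel.  Hypothetical example (kernel name and
+// signature chosen for illustration only):
+//
+//   extern void kernel_example_8x8_lib8(int k, float *A, float *B,
+//                                       float *C, float *D, float *E,
+//                                       int km, int kn);
+//   ...
+//   kernel_example_8x8_lib8(k, A, B, C, D, E, km, kn);
+//
+// On OS_LINUX/OS_MAC the first six integer/pointer arguments arrive in
+// rdi..r9 and ARG7 onwards are read from the caller's stack above the 64-byte
+// spill area; on OS_WINDOWS only four arguments arrive in registers and
+// xmm6-xmm15 are callee saved, hence the larger 256-byte frame.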
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1  <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2  <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3  <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1  <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2  <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3  <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
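+// Hedged note (not from the original source): one way to read the layout
+// above is that after the nt inner kernel, element i of accumulator p holds
+// the partial product d_{i,j} with j = (i%4) ^ perm[p], where
+// perm[] = {0, 1, 3, 2}; the alternating vshufps $0xb1 / $0x4e below rotate
+// the four broadcast B values through that order so that each multiply
+// targets a different column, and ymm4..ymm7 repeat the same pattern for
+// columns 4..7.  The blend/scale routines later undo this permutation.
+//
+//   // hedged C model of the index pattern, for reference only:
+//   // int perm[4] = {0, 1, 3, 2};
+//   // acc[p][i] accumulates sum over l of A[i][l] * B[(i%4) ^ perm[p]][l]
+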
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nt_8x8_lib8, @function
+inner_kernel_gemm_add_nt_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nt_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_8x8_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vbroadcastf128	16(%r12), %ymm15 // B
+	vmovaps			32(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	32(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vbroadcastf128	48(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm7, %ymm7
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	80(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm7, %ymm7
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	96(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vbroadcastf128	112(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm7, %ymm7
+	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	16(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm7, %ymm7
+	vmovaps			32(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	32(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vbroadcastf128	48(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm7, %ymm7
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	80(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm7, %ymm7
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	96(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vbroadcastf128	112(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm7, %ymm7
+//	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm14, %ymm11
+//	vbroadcastf128	0(%r12), %ymm14 // B
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm13, %ymm15, %ymm11
+//	vbroadcastf128	16(%r12), %ymm15 // B
+	vaddps			%ymm11, %ymm7, %ymm7
+//	vmovaps			32(%r11), %ymm13 // A
+
+
+//	cmpl	$4, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	vbroadcastf128	16(%r12), %ymm14 // B
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm4, %ymm4
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm5, %ymm5
+
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm6, %ymm6
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$32, %r12
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm7, %ymm7
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nt_8x8_lib8, .-inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1  <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2  <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3  <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1  <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2  <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3  <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_sub_nt_8x8_lib8, @function
+inner_kernel_gemm_sub_nt_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_sub_nt_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_8x8_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vbroadcastf128	16(%r12), %ymm15 // B
+	vmovaps			32(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	32(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vbroadcastf128	48(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm7, %ymm7
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	80(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm7, %ymm7
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	96(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vbroadcastf128	112(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm7, %ymm7
+	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	16(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm7, %ymm7
+	vmovaps			32(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	32(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vbroadcastf128	48(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm7, %ymm7
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vbroadcastf128	80(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm7, %ymm7
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm12, %ymm14, %ymm11
+	vbroadcastf128	96(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm12, %ymm15, %ymm11
+	vbroadcastf128	112(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm7, %ymm7
+//	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vmulps			%ymm13, %ymm14, %ymm11
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vmulps			%ymm13, %ymm14, %ymm11
+//	vbroadcastf128	0(%r12), %ymm14 // B
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vmulps			%ymm13, %ymm15, %ymm11
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	vmulps			%ymm13, %ymm15, %ymm11
+//	vbroadcastf128	16(%r12), %ymm15 // B
+	vsubps			%ymm11, %ymm7, %ymm7
+//	vmovaps			32(%r11), %ymm13 // A
+
+
+//	cmpl	$4, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm0, %ymm0
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm1, %ymm1
+
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm2, %ymm2
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm3, %ymm3
+
+	vbroadcastf128	16(%r12), %ymm14 // B
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm4, %ymm4
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm5, %ymm5
+
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm6, %ymm6
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$32, %r12
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vsubps			%ymm11, %ymm7, %ymm7
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_sub_nt_8x8_lib8, .-inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
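+// Hedged note (not from the original source): in the nn variant B is read
+// straight from its 8-row panel, so within the current panel element
+// B(row l, col j) sits at byte offset l*4 + j*32, which is where the
+// vbroadcastss offsets 0,32,...,224 / 4,36,... in the unrolled loop below
+// come from; every 8 values of k the kernel then jumps to the next panel
+// through r13 (prefetched via r14/B_next).
+//
+//   // hedged C model of the addressing, for reference only:
+//   // float b_lj = B_panel[j*8 + l];   // byte offset l*4 + j*32
+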
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nn_8x8_lib8, @function
+inner_kernel_gemm_add_nn_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_8x8_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r12, %r14 // B_next <- B
+	addq	%r13, %r14 // B_next <- B + 8*sdb*sizeof(float)
+
+	cmpl	$8, %r10d
+	jl		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	0(%r14) // software prefetch
+	prefetcht0	64(%r14) // software prefetch
+	prefetcht0	128(%r14) // software prefetch
+	prefetcht0	192(%r14) // software prefetch
+
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	128(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	160(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	192(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	224(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 1
+	vmovaps			32(%r11), %ymm12 // A[0]
+	vbroadcastss	4(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	68(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	100(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	132(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	164(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	196(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	228(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 2
+	vmovaps			64(%r11), %ymm12 // A[0]
+	vbroadcastss	8(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	40(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	72(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	104(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	136(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	168(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	200(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	232(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 3
+	vmovaps			96(%r11), %ymm12 // A[0]
+	vbroadcastss	12(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	44(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	76(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	108(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	140(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	172(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	204(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	236(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 4
+	vmovaps			128(%r11), %ymm12 // A[0]
+	vbroadcastss	16(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	48(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	80(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	112(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	144(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	176(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	208(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	240(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 5
+	vmovaps			160(%r11), %ymm12 // A[0]
+	vbroadcastss	20(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	52(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	84(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	116(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	148(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	180(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	212(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	244(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 6
+	vmovaps			192(%r11), %ymm12 // A[0]
+	vbroadcastss	24(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	56(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	88(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	120(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	152(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	184(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	216(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	248(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+
+	// unroll 7
+	vmovaps			224(%r11), %ymm12 // A[0]
+	vbroadcastss	28(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	60(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	92(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	124(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	156(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	188(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	220(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	252(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	subl	$8, %r10d
+	addq	$256, %r11
+
+	mov		%r14, %r12
+	addq	%r13, %r14
+
+	cmpl	$7, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean1-up loop
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean1-up loop
+	
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	128(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	160(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	192(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	224(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$4, %r12
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nn_8x8_lib8, .-inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- k-(8-offB)
+// r11   <- A+(8-offB)*8*sizeof(float)
+// r12   <- B-offB+8*sdb*sizeof(float)
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
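+// Reference sketch (element-level C-style pseudo-code; assumes the lib8 panel-major
+// layout used in this file, with offB in 0..7 being B's row offset inside its first
+// 8-row panel):
+//
+//   kend = min(k, 8-offB);                         // B rows left in the first panel
+//   for(l=0; l<kend; l++)
+//     for(j=0; j<8; j++)
+//       for(i=0; i<8; i++)
+//         acc[i][j] += A[i+8*l] * B[offB+l+8*j];   // one rank-1 update per row of B
+//   // afterwards B points to the start of its next row panel
+//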
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_gemm_add_nn_8x8_lib8, @function
+inner_edge_gemm_add_nn_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_8x8_lib8:
+#endif
+#endif
+	
+	cmpl			$0, %r14d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$8, %ebx
+	subl			%r14d, %ebx // 8-offsetB
+	cmpl			%r10d, %ebx
+//	jle				0f
+//	movl			%r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+	cmovgl			%r10d, %ebx // kend=min(k,8-offsetB)
+
+	movl			%r14d, %eax
+	sall			$2, %eax // offsetB*sizeof(float)
+	addq			%rax, %r12 // B+offsetB*sizeof(float)
+
+1: // edge loop
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+	vbroadcastss	128(%r12), %ymm13 // B[4]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm4, %ymm4
+	vbroadcastss	160(%r12), %ymm13 // B[5]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm5, %ymm5
+	vbroadcastss	192(%r12), %ymm13 // B[6]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm6, %ymm6
+	vbroadcastss	224(%r12), %ymm13 // B[7]
+	vmulps			%ymm12, %ymm13, %ymm15
+	vaddps			%ymm15, %ymm7, %ymm7
+
+	subl			$1, %r10d // k-1
+	subl			$1, %ebx // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(float)
+	addq			$4, %r12 // B+1*sizeof(float)
+
+	cmpl			$0, %ebx
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r13, %r12
+	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_gemm_add_nn_8x8_lib8, .-inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
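+// Reference sketch (C-style pseudo-code; x[j] is the 8-wide column held in ymm<j>, E the
+// lower-triangular 8x8 factor stored panel-major at D, inv_diag_D its inverted diagonal;
+// kn only gates the updates of the trailing columns when fewer than 8 are needed):
+//
+//   for(j=0; j<8; j++)
+//     {
+//     x[j] *= inv_diag_D[j];
+//     for(i=j+1; i<8; i++)
+//       x[i] -= E[i+8*j] * x[j];
+//     }
+//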
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsm_rlt_inv_8x8_vs_lib8, @function
+inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsm_rlt_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#endif
+#endif
+
+	vbroadcastss	0(%r11), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vbroadcastss	4(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm1, %ymm1
+	vbroadcastss	8(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vbroadcastss	12(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+	vbroadcastss	16(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm4, %ymm4
+	vbroadcastss	20(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm5, %ymm5
+	vbroadcastss	24(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm6, %ymm6
+	vbroadcastss	28(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	4(%r11), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vbroadcastss	40(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vbroadcastss	44(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+	vbroadcastss	48(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm4, %ymm4
+	vbroadcastss	52(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm5, %ymm5
+	vbroadcastss	56(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm6, %ymm6
+	vbroadcastss	60(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	8(%r11), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vbroadcastss	76(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+	vbroadcastss	80(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm4, %ymm4
+	vbroadcastss	84(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm5, %ymm5
+	vbroadcastss	88(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm6, %ymm6
+	vbroadcastss	92(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	12(%r11), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+	vbroadcastss	112(%r10), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm4, %ymm4
+	vbroadcastss	116(%r10), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm5, %ymm5
+	vbroadcastss	120(%r10), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm6, %ymm6
+	vbroadcastss	124(%r10), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	16(%r11), %ymm13
+	vmulps			%ymm4, %ymm13, %ymm4
+	cmpl			$6, %r12d
+	jl				0f // ret
+	vbroadcastss	148(%r10), %ymm13
+	vmulps			%ymm4, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm5, %ymm5
+	vbroadcastss	152(%r10), %ymm13
+	vmulps			%ymm4, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm6, %ymm6
+	vbroadcastss	156(%r10), %ymm13
+	vmulps			%ymm4, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	20(%r11), %ymm13
+	vmulps			%ymm5, %ymm13, %ymm5
+	cmpl			$7, %r12d
+	jl				0f // ret
+	vbroadcastss	184(%r10), %ymm13
+	vmulps			%ymm5, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm6, %ymm6
+	vbroadcastss	188(%r10), %ymm13
+	vmulps			%ymm5, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	24(%r11), %ymm13
+	vmulps			%ymm6, %ymm13, %ymm6
+	cmpl			$8, %r12d
+	jl				0f // ret
+	vbroadcastss	220(%r10), %ymm13
+	vmulps			%ymm6, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm7, %ymm7
+
+	vbroadcastss	28(%r11), %ymm13
+	vmulps			%ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsm_rlt_inv_8x8_vs_lib8, .-inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization 
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
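+// Reference sketch (C-style pseudo-code; x[j] is the 8-wide column held in ymm<j> and
+// x[j][i] its i-th lane; kn (r11d) gates the trailing columns; a non-positive pivot gets
+// an inverse of 0.0, see the numbered branches at the bottom of the routine):
+//
+//   for(j=0; j<8; j++)
+//     {
+//     d = x[j][j];
+//     inv_diag_E[j] = inv = (d>0.0) ? 1.0/sqrtf(d) : 0.0;
+//     x[j] *= inv;                      // scale column j
+//     for(i=j+1; i<8; i++)
+//       x[i] -= x[j][i] * x[j];         // rank-1 update of the remaining columns
+//     }
+//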
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_8x8_vs_lib8, @function
+inner_edge_potrf_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x8_vs_lib8:
+#endif
+#endif
+
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vmovss		%xmm0, %xmm0, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+2:
+	vmovss		%xmm13, 0(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm0
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm1, %ymm1
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vperm2f128	$0x11, %ymm0, %ymm0, %ymm11
+	vpermilps	$0x00, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm4, %ymm4
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm5, %ymm5
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vpermilps	$0x55, %xmm1, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+4:
+	vmovss		%xmm13, 4(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm1
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm11
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vperm2f128	$0x11, %ymm1, %ymm1, %ymm11
+	vpermilps	$0x00, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm4, %ymm4
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm5, %ymm5
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vpermilps	$0xaa, %xmm2, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+6:
+	vmovss		%xmm13, 8(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm2
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm11
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+	vperm2f128	$0x11, %ymm2, %ymm2, %ymm11
+	vpermilps	$0x00, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm4, %ymm4
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm5, %ymm5
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vpermilps	$0xff, %xmm3, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+8:
+	vmovss		%xmm13, 12(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm3
+	vperm2f128	$0x11, %ymm3, %ymm3, %ymm11
+	vpermilps	$0x00, %ymm11, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm4, %ymm4
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm5, %ymm5
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vextractf128	$0x1, %ymm4, %xmm13
+//	vpermilps	$0x00, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_44 > 0.0 ?
+	jbe			9f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+10:
+	vmovss		%xmm13, 16(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm4, %ymm13, %ymm4
+	cmpl		$6, %r11d
+	jl			0f // ret
+	vperm2f128	$0x11, %ymm4, %ymm4, %ymm11
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm5, %ymm5
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm4, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vextractf128	$0x1, %ymm5, %xmm13
+	vpermilps	$0x55, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_55 > 0.0 ?
+	jbe			11f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+12:
+	vmovss		%xmm13, 20(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm5, %ymm13, %ymm5
+	cmpl		$7, %r11d
+	jl			0f // ret
+	vperm2f128	$0x11, %ymm5, %ymm5, %ymm11
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm5, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm6, %ymm6
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm5, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vextractf128	$0x1, %ymm6, %xmm13
+	vpermilps	$0xaa, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_66 > 0.0 ?
+	jbe			13f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+14:
+	vmovss		%xmm13, 24(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm6, %ymm13, %ymm6
+	cmpl		$8, %r11d
+	jl			0f // ret
+	vperm2f128	$0x11, %ymm6, %ymm6, %ymm11
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm6, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm7, %ymm7
+
+
+	vextractf128	$0x1, %ymm7, %xmm13
+	vpermilps	$0xff, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_77 > 0.0 ?
+	jbe			15f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+16:
+	vmovss		%xmm13, 28(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm7, %ymm13, %ymm7
+
+
+	jmp		0f
+
+
+1:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+9:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		10b
+
+11:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		12b
+
+13:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		14b
+
+15:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		16b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_8x8_vs_lib8, .-inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
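+// Reference sketch (C-style pseudo-code; acc[j] is the 8-wide column held in ymm<j>):
+//
+//   for(j=0; j<8; j++)
+//     acc[j] = alpha[0]*acc[j] + beta[0]*C[8*j .. 8*j+7];   // the C term is skipped when beta==0.0
+//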
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x8_lib8, @function
+inner_scale_ab_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovaps		0(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+	vmovaps		128(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+	vmovaps		160(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+	vmovaps		192(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+	vmovaps		224(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x8_lib8, .-inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 8*sdc*sizeof(float)
+// r15  <- n0 // col index: start from (inc)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x8_gen_lib8, @function
+inner_scale_ab_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_gen_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+
+	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			7f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+	vmovaps		128(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm4, %ymm12, %ymm4
+	vmovaps		160(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm5, %ymm12, %ymm5
+	vmovaps		192(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm6, %ymm12, %ymm6
+	vmovaps		224(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm7, %ymm12, %ymm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x8_gen_lib8, .-inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
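+// Reference sketch (C-style pseudo-code): the nt inner kernel leaves the result columns
+// spread over the accumulators in a lane-shuffled order; the vblendps ladder below first
+// restores ymm<j> = column j, then the usual alpha/beta scaling is applied:
+//
+//   unshuffle(acc[0..7]);
+//   for(j=0; j<8; j++)
+//     acc[j] = alpha[0]*acc[j] + beta[0]*C[8*j .. 8*j+7];   // the C term is skipped when beta==0.0
+//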
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x8_lib8, @function
+inner_blend_scale_ab_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x8_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
+	vblendps	$0x55, %ymm3, %ymm2, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
+	vblendps	$0x33, %ymm14, %ymm13, %ymm3
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
+	vblendps	$0x55, %ymm5, %ymm4, %ymm13
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
+	vblendps	$0x55, %ymm7, %ymm6, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
+	vblendps	$0x33, %ymm15, %ymm12, %ymm6
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
+	vblendps	$0x33, %ymm14, %ymm13, %ymm7
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovaps		0(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+	vmovaps		128(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+	vmovaps		160(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+	vmovaps		192(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+	vmovaps		224(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x8_lib8, .-inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 8*sdc*sizeof(float)
+// r15  <- n0 // col index: start from (inc)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x8_gen_lib8, @function
+inner_blend_scale_ab_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x8_gen_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
+	vblendps	$0x55, %ymm3, %ymm2, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
+	vblendps	$0x33, %ymm14, %ymm13, %ymm3
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
+	vblendps	$0x55, %ymm5, %ymm4, %ymm13
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
+	vblendps	$0x55, %ymm7, %ymm6, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
+	vblendps	$0x33, %ymm15, %ymm12, %ymm6
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
+	vblendps	$0x33, %ymm14, %ymm13, %ymm7
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+
+	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			7f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+	vmovaps		128(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm4, %ymm12, %ymm4
+	vmovaps		160(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm5, %ymm12, %ymm5
+	vmovaps		192(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm6, %ymm12, %ymm6
+	vmovaps		224(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm7, %ymm12, %ymm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x8_gen_lib8, .-inner_blend_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
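+// Reference sketch (C-style pseudo-code; same unshuffle as inner_blend_scale_ab_8x8_lib8,
+// followed by a plain accumulation, i.e. alpha=1.0 and beta=1.0):
+//
+//   unshuffle(acc[0..7]);
+//   for(j=0; j<8; j++)
+//     acc[j] += C[8*j .. 8*j+7];
+//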
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_8x8_lib8, @function
+inner_blend_scale_11_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_8x8_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x8_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
+	vblendps	$0x55, %ymm3, %ymm2, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
+	vblendps	$0x33, %ymm14, %ymm13, %ymm3
+
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
+	vblendps	$0x55, %ymm5, %ymm4, %ymm13
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
+	vblendps	$0x55, %ymm7, %ymm6, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
+	vblendps	$0x33, %ymm15, %ymm12, %ymm6
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
+	vblendps	$0x33, %ymm14, %ymm13, %ymm7
+
+	vmovaps		0(%r10), %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r10), %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r10), %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r10), %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+	vmovaps		128(%r10), %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+	vmovaps		160(%r10), %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+	vmovaps		192(%r10), %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+	vmovaps		224(%r10), %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_8x8_lib8, .-inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_8X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_8x8_gen_lib8, @function
+inner_blend_scale_11_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x8_gen_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
+	vblendps	$0x55, %ymm3, %ymm2, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
+	vblendps	$0x33, %ymm14, %ymm13, %ymm3
+
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
+	vblendps	$0x55, %ymm5, %ymm4, %ymm13
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
+	vblendps	$0x55, %ymm7, %ymm6, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
+	vblendps	$0x33, %ymm15, %ymm12, %ymm6
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
+	vblendps	$0x33, %ymm14, %ymm13, %ymm7
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r11), %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r11), %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r11), %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r11), %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+	vmovaps		128(%r11), %ymm12
+	vaddps		%ymm4, %ymm12, %ymm4
+	vmovaps		160(%r11), %ymm12
+	vaddps		%ymm5, %ymm12, %ymm5
+	vmovaps		192(%r11), %ymm12
+	vaddps		%ymm6, %ymm12, %ymm6
+	vmovaps		224(%r11), %ymm12
+	vaddps		%ymm7, %ymm12, %ymm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %r15 // C0
+	addq	%r12, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_8x8_gen_lib8, .-inner_blend_scale_11_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x8_lib8, @function
+inner_store_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x8_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_lib8:
+#endif
+#endif
+	
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps 	%ymm3, 96(%r10)
+	vmovaps 	%ymm4, 128(%r10)
+	vmovaps 	%ymm5, 160(%r10)
+	vmovaps 	%ymm6, 192(%r10)
+	vmovaps 	%ymm7, 224(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x8_lib8, .-inner_store_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
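+// Reference sketch (C-style pseudo-code): km masks the rows, kn the columns.  The row
+// mask is built lane-wise as LC00[i] - (float)km, LC00 being the ascending-lane constant
+// defined elsewhere in this file, so the sign bit is set exactly for rows i < km and
+// vmaskmovps writes only those rows; columns 0..4 are always written, columns 5..7 only
+// when kn is at least 6, 7 and 8 respectively:
+//
+//   for(j=0; j<kn; j++)
+//     for(i=0; i<km; i++)
+//       D[i+8*j] = acc[j][i];
+//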
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x8_vs_lib8, @function
+inner_store_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	vmaskmovps	%ymm0, %ymm15,  0(%r10)
+	vmaskmovps	%ymm1, %ymm15,  32(%r10)
+	vmaskmovps	%ymm2, %ymm15,  64(%r10)
+	vmaskmovps	%ymm3, %ymm15,  96(%r10)
+	vmaskmovps	%ymm4, %ymm15,  128(%r10)
+	cmpl		$6, %r12d
+	jl			0f // end
+	vmaskmovps	%ymm5, %ymm15, 160(%r10)
+	cmpl		$7, %r12d
+	jl			0f // end
+	vmaskmovps	%ymm6, %ymm15, 192(%r10)
+	je			0f // end
+	vmaskmovps	%ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x8_vs_lib8, .-inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
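+// Reference sketch (C-style pseudo-code, offset==0 path only; the row mask and's two
+// LC00-based comparisons so that lane i is written only when m0 <= i < m1, and a positive
+// n0 shifts the accumulator columns left while advancing D by the same amount; the
+// offset!=0 paths below are still TODO):
+//
+//   for(j=n0; j<n1 && j<8; j++)
+//     for(i=m0; i<m1 && i<8; i++)
+//       D[i+8*j] = acc[j][i];
+//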
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x8_gen_lib8, @function
+inner_store_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm12, %ymm15
+	vandps		%ymm14, %ymm15, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$8, %eax
+	jle		0f
+	movl	$8, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm15,  0(%r11)
+	vmaskmovps	%ymm1, %ymm15,  32(%r11)
+	vmaskmovps	%ymm2, %ymm15,  64(%r11)
+	vmaskmovps	%ymm3, %ymm15,  96(%r11)
+	vmaskmovps	%ymm4, %ymm15,  128(%r11)
+	cmpl		$6, %r15d
+	jl			7f // end
+	vmaskmovps	%ymm5, %ymm15, 160(%r11)
+	cmpl		$7, %r15d
+	jl			7f // end
+	vmaskmovps	%ymm6, %ymm15, 192(%r11)
+	je			7f // end
+	vmaskmovps	%ymm7, %ymm15, 224(%r11)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x8_gen_lib8, .-inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
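+// Reference sketch (C-style pseudo-code): only the lower-triangular part of the block is
+// stored; for column j the first j entries are re-read from D and blended back in, so the
+// strictly upper part of D is left untouched:
+//
+//   for(j=0; j<8; j++)
+//     for(i=j; i<8; i++)
+//       D[i+8*j] = acc[j][i];
+//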
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x8_lib8, @function
+inner_store_l_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x8_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_lib8:
+#endif
+#endif
+	
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps		32(%r10), %ymm14
+	vblendps	$0x01, %ymm14, %ymm1, %ymm1
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps		64(%r10), %ymm14
+	vblendps	$0x03, %ymm14, %ymm2, %ymm2
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps		96(%r10), %ymm14
+	vblendps	$0x07, %ymm14, %ymm3, %ymm3
+	vmovaps 	%ymm3, 96(%r10)
+	vmovaps		128(%r10), %ymm14
+	vblendps	$0x0f, %ymm14, %ymm4, %ymm4
+	vmovaps 	%ymm4, 128(%r10)
+	vmovaps		160(%r10), %ymm14
+	vblendps	$0x1f, %ymm14, %ymm5, %ymm5
+	vmovaps 	%ymm5, 160(%r10)
+	vmovaps		192(%r10), %ymm14
+	vblendps	$0x3f, %ymm14, %ymm6, %ymm6
+	vmovaps 	%ymm6, 192(%r10)
+	vmovaps		224(%r10), %ymm14
+	vblendps	$0x7f, %ymm14, %ymm7, %ymm7
+	vmovaps 	%ymm7, 224(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_lib8, .-inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x8_vs_lib8, @function
+inner_store_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm15,  0(%r10)
+	vmovaps 	32(%r10), %ymm12
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm15,  32(%r10)
+	vmovaps 	64(%r10), %ymm12
+	vblendps	$0x03, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm15,  64(%r10)
+	vmovaps 	96(%r10), %ymm12
+	vblendps	$0x07, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm15,  96(%r10)
+	vmovaps 	128(%r10), %ymm12
+	vblendps	$0x0f, %ymm12, %ymm4, %ymm4
+	vmaskmovps	%ymm4, %ymm15,  128(%r10)
+	cmpl		$6, %r12d
+	jl			0f // end
+	vmovaps 	160(%r10), %ymm12
+	vblendps	$0x1f, %ymm12, %ymm5, %ymm5
+	vmaskmovps	%ymm5, %ymm15, 160(%r10)
+	cmpl		$7, %r12d
+	jl			0f // end
+	vmovaps 	192(%r10), %ymm12
+	vblendps	$0x3f, %ymm12, %ymm6, %ymm6
+	vmaskmovps	%ymm6, %ymm15, 192(%r10)
+	je			0f // end
+	vmovaps 	224(%r10), %ymm12
+	vblendps	$0x7f, %ymm12, %ymm7, %ymm7
+	vmaskmovps	%ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_vs_lib8, .-inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x8_gen_lib8, @function
+inner_store_l_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm12, %ymm15
+	vandps		%ymm14, %ymm15, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$8, %eax
+	jle		0f
+	movl	$8, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm15,  0(%r11)
+	vmovaps 	32(%r11), %ymm12
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm15,  32(%r11)
+	vmovaps 	64(%r11), %ymm12
+	vblendps	$0x03, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm15,  64(%r11)
+	vmovaps 	96(%r11), %ymm12
+	vblendps	$0x07, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm15,  96(%r11)
+	vmovaps 	128(%r11), %ymm12
+	vblendps	$0x0f, %ymm12, %ymm4, %ymm4
+	vmaskmovps	%ymm4, %ymm15,  128(%r11)
+	cmpl		$6, %r15d
+	jl			7f // end
+	vmovaps 	160(%r11), %ymm12
+	vblendps	$0x1f, %ymm12, %ymm5, %ymm5
+	vmaskmovps	%ymm5, %ymm15, 160(%r11)
+	cmpl		$7, %r15d
+	jl			7f // end
+	vmovaps 	192(%r11), %ymm12
+	vblendps	$0x3f, %ymm12, %ymm6, %ymm6
+	vmaskmovps	%ymm6, %ymm15, 192(%r11)
+	je			7f // end
+	vmovaps 	224(%r11), %ymm12
+	vblendps	$0x7f, %ymm12, %ymm7, %ymm7
+	vmaskmovps	%ymm7, %ymm15, 224(%r11)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_gen_lib8, .-inner_store_l_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                               rdi    rsi           rdx       rcx       r8           r9        rsp+8
+// void kernel_sgemm_nt_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
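+//
+// Reference sketch (C-style pseudo-code; A and B are 8 x k panels in lib8 panel-major
+// storage, C and D are 8x8 blocks stored column-wise inside one panel):
+//
+//   for(j=0; j<8; j++)
+//     for(i=0; i<8; i++)
+//       {
+//       tmp = 0.0;
+//       for(l=0; l<k; l++)
+//         tmp += A[i+8*l] * B[j+8*l];
+//       D[i+8*j] = alpha[0]*tmp + beta[0]*C[i+8*j];
+//       }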
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x8_lib8
+	.type kernel_sgemm_nt_8x8_lib8, @function
+kernel_sgemm_nt_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x8_lib8
+_kernel_sgemm_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x8_lib8
+	.def kernel_sgemm_nt_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x8_lib8, .-kernel_sgemm_nt_8x8_lib8
+#endif
+
+
+
+
+
+//                                  1      2             3         4         5            6         7         8       9
+// void kernel_sgemm_nt_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x8_vs_lib8
+	.type kernel_sgemm_nt_8x8_vs_lib8, @function
+kernel_sgemm_nt_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x8_vs_lib8
+_kernel_sgemm_nt_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x8_vs_lib8
+	.def kernel_sgemm_nt_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x8_vs_lib8, .-kernel_sgemm_nt_8x8_vs_lib8
+#endif
+
+
+
+
+
+//                                   rdi    rsi           rdx       rcx       r8           r9           rsp+8     rsp+16   rsp+24       rsp+32    rsp+40   rsp+48  rsp+56  rsp+64  rsp+72
+// void kernel_sgemm_nt_8x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
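+//
+// Reference sketch (C-style pseudo-code): same product as kernel_sgemm_nt_8x8_lib8, but C
+// and D may start at a row offset inside their panels (offsetC/offsetD, panel strides
+// sdc/sdd) and only the sub-block with rows m0..m1-1 and columns n0..n1-1 is written:
+//
+//   for(j=n0; j<n1 && j<8; j++)
+//     for(i=m0; i<m1 && i<8; i++)
+//       D_gen(i,j) = alpha[0]*(sum over l<k of A[i+8*l]*B[j+8*l]) + beta[0]*C_gen(i,j);
+//
+//   // X_gen(i,j) resolves (i,j) across the offsetX/sdx panel boundary; the offset!=0
+//   // paths of the gen inner routines are still TODO.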
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x8_gen_lib8
+	.type kernel_sgemm_nt_8x8_gen_lib8, @function
+kernel_sgemm_nt_8x8_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x8_gen_lib8
+_kernel_sgemm_nt_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x8_gen_lib8
+	.def kernel_sgemm_nt_8x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // offsetC
+	movq	ARG7, %r13 // C
+	movq	ARG8, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG9, %r10 // offsetD
+	movq	ARG10, %r11 // D
+	movq	ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG12, %r13 // m0
+	movq	ARG13, %r14 // m1
+	movq	ARG14, %r15 // n0
+	movq	ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x8_gen_lib8, .-kernel_sgemm_nt_8x8_gen_lib8
+#endif
+
+
+
+
+
+//                               rdi    rsi           rdx        rcx         r8         r9      rsp+8        rsp+16    rsp+24
+// void kernel_sgemm_nn_8x8_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
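+//
+// Reference sketch (C-style pseudo-code; B is k x 8, stored in 8-row panels with panel
+// stride sdb and starting offsetB rows into its first panel):
+//
+//   #define B_ELEM(r, j) B[((r)/8)*8*sdb + (r)%8 + 8*(j)]
+//   for(j=0; j<8; j++)
+//     for(i=0; i<8; i++)
+//       {
+//       tmp = 0.0;
+//       for(l=0; l<k; l++)
+//         tmp += A[i+8*l] * B_ELEM(offsetB+l, j);
+//       D[i+8*j] = alpha[0]*tmp + beta[0]*C[i+8*j];
+//       }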
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x8_lib8
+	.type kernel_sgemm_nn_8x8_lib8, @function
+kernel_sgemm_nn_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x8_lib8
+_kernel_sgemm_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x8_lib8
+	.def kernel_sgemm_nn_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x8_lib8, .-kernel_sgemm_nn_8x8_lib8
+#endif
+
+
+
+
+
+//                               1      2             3         4            5         6        7            8         9         10      11
+// void kernel_sgemm_nn_8x8_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x8_vs_lib8
+	.type kernel_sgemm_nn_8x8_vs_lib8, @function
+kernel_sgemm_nn_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x8_vs_lib8
+_kernel_sgemm_nn_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x8_vs_lib8
+	.def kernel_sgemm_nn_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // km
+	movq	ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x8_vs_lib8, .-kernel_sgemm_nn_8x8_vs_lib8
+#endif
+
+
+
+
+
+//                                   rdi    rsi           rdx       rcx       r8        r9       rsp+8        rsp+16    rsp+24    rsp+32    rsp+40   rsp+48     rsp+56   rsp+64  rsp+72  rsp+80  rsp+88
+// void kernel_sgemm_nn_8x8_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x8_gen_lib8
+	.type kernel_sgemm_nn_8x8_gen_lib8, @function
+kernel_sgemm_nn_8x8_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x8_gen_lib8
+_kernel_sgemm_nn_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x8_gen_lib8
+	.def kernel_sgemm_nn_8x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // offsetC
+	movq	ARG9, %r13 // C
+	movq	ARG10, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG11, %r10 // offsetD
+	movq	ARG12, %r11 // D
+	movq	ARG13, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG14, %r13 // m0
+	movq	ARG15, %r14 // m1
+	movq	ARG16, %r15 // n0
+	movq	ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x8_gen_lib8, .-kernel_sgemm_nn_8x8_gen_lib8
+#endif
+
+
+
+
+
+//                                 rdi    rsi           rdx       rcx       r8           r9        rsp+8
+// void kernel_ssyrk_nt_l_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_8x8_lib8
+	.type kernel_ssyrk_nt_l_8x8_lib8, @function
+kernel_ssyrk_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_8x8_lib8
+_kernel_ssyrk_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_8x8_lib8
+	.def kernel_ssyrk_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_8x8_lib8, .-kernel_ssyrk_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+//                                    1      2             3         4         5            6         7         8       9
+// void kernel_ssyrk_nt_l_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_8x8_vs_lib8
+	.type kernel_ssyrk_nt_l_8x8_vs_lib8, @function
+kernel_ssyrk_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_8x8_vs_lib8
+_kernel_ssyrk_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_8x8_vs_lib8
+	.def kernel_ssyrk_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_8x8_vs_lib8, .-kernel_ssyrk_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+//                                      edi    rsi       rdx       ecx       r8        r9        rsp+8     
+// void kernel_strsm_nt_rl_inv_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_8x8_lib8
+	.type kernel_strsm_nt_rl_inv_8x8_lib8, @function
+kernel_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_8x8_lib8
+_kernel_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_8x8_lib8
+	.def kernel_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+	movl	$8, %r12d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_8x8_lib8, .-kernel_strsm_nt_rl_inv_8x8_lib8
+#endif
+
+
+
+
+
+//                                         edi    rsi       rdx       ecx       r8        r9        rsp+8               rsp+16  rsp+24  
+// void kernel_strsm_nt_rl_inv_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
+	.type kernel_strsm_nt_rl_inv_8x8_vs_lib8, @function
+kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_8x8_vs_lib8
+_kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
+	.def kernel_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn // TODO scale gen
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG8, %r11 // m1 
+	movq	ARG9, %r12 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_strsm_nt_rl_inv_8x8_vs_lib8
+#endif
+
+
+
+
+
+//                                            1       2          3          4       5          6          7         8         9         10
+// void kernel_sgemm_strsm_nt_rl_inv_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	$8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10   // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+#endif
+
+
+
+
+
+//                                               1       2          3          4       5          6          7         8         9         10                 11      12
+// void kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10  // C 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D 
+	movq	ARG11, %r11 // km 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+#endif
+
+
+
+
+
+//                                  edi    rsi       rdx       rcx       r8        r9
+// void kernel_spotrf_nt_l_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_8x8_lib8
+	.type kernel_spotrf_nt_l_8x8_lib8, @function
+kernel_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_8x8_lib8
+_kernel_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_8x8_lib8
+	.def kernel_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movl	$8, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_8x8_lib8, .-kernel_spotrf_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+//                                     edi    rsi       rdx       rcx       r8        r9                  rsp+8   rsp+16
+// void kernel_spotrf_nt_l_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_8x8_vs_lib8
+	.type kernel_spotrf_nt_l_8x8_vs_lib8, @function
+kernel_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_8x8_vs_lib8
+_kernel_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_8x8_vs_lib8
+	.def kernel_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movq	ARG8, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG7, %r11 // m1 
+	movq	ARG8, %r12 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_8x8_vs_lib8, .-kernel_spotrf_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+//                                        1       2          3          4       5          6          7         8         9
+// void kernel_ssyrk_spotrf_nt_l_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
+	.type kernel_ssyrk_spotrf_nt_l_8x8_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_8x8_lib8
+_kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
+	.def kernel_ssyrk_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$8, %r11d
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_8x8_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+//                                           1       2          3          4       5          6          7         8         9                  10      11
+// void kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+	.type kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+	.def kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+	movq	ARG10, %r11 // km 
+	movq	ARG11, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
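+	// the .long words below are the IEEE-754 single-precision bit patterns of the
+	// floats listed in the braces (lane 7 first), e.g. 1056964608 = 0x3f000000 = 0.5f,
+	// 1065353216 = 0x3f800000 = 1.0f, 3212836864 = 0xbf800000 = -1.0f; the
+	// half-integer index vectors LC00-LC02 are compared (by subtraction) against
+	// row/column counts to build the sign-bit masks consumed by vmaskmovps
+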
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_diag_lib8.c b/kernel/avx/kernel_sgemm_diag_lib8.c
new file mode 100644
index 0000000..63183b2
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_diag_lib8.c
@@ -0,0 +1,480 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+
+
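+// The kernels below compute D = alpha * A * diag(B) + beta * C (the _a0 variant
+// assumes beta==0), with A, C and D stored in panel-major format with 8-row
+// panels: within a panel, column j occupies the 8 consecutive floats at offset
+// 8*j, and adding 8*sda (8*sdc, 8*sdd) floats advances to the next panel of rows.
+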
+// B is the diagonal of a matrix, beta==0.0 case
+void kernel_sgemm_diag_right_4_a0_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 8;
+
+	int k;
+
+	__m256
+		alpha0,
+		mask_f,
+		sign,
+		a_00,
+		b_00, b_11, b_22, b_33,
+		d_00, d_01, d_02, d_03;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_ss( alpha );
+	
+	b_00 = _mm256_broadcast_ss( &B[0] );
+	b_00 = _mm256_mul_ps( b_00, alpha0 );
+	b_11 = _mm256_broadcast_ss( &B[1] );
+	b_11 = _mm256_mul_ps( b_11, alpha0 );
+	b_22 = _mm256_broadcast_ss( &B[2] );
+	b_22 = _mm256_mul_ps( b_22, alpha0 );
+	b_33 = _mm256_broadcast_ss( &B[3] );
+	b_33 = _mm256_mul_ps( b_33, alpha0 );
+	
+	for(k=0; k<kmax-7; k+=8)
+		{
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+		a_00 = _mm256_load_ps( &A[8] );
+		d_01 = _mm256_mul_ps( a_00, b_11 );
+		a_00 = _mm256_load_ps( &A[16] );
+		d_02 = _mm256_mul_ps( a_00, b_22 );
+		a_00 = _mm256_load_ps( &A[24] );
+		d_03 = _mm256_mul_ps( a_00, b_33 );
+
+		_mm256_store_ps( &D[0], d_00 );
+		_mm256_store_ps( &D[8], d_01 );
+		_mm256_store_ps( &D[16], d_02 );
+		_mm256_store_ps( &D[24], d_03 );
+
+		A += 8*sda;
+		D += 8*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+		float m_f = kmax-k;
+
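+		// kmax-k rows remain: after the subtraction, lanes with index+0.5 < kmax-k
+		// are negative, so their sign bit is set and _mm256_maskstore_ps below
+		// writes only those leading lanes of each column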
+		mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+		a_00 = _mm256_load_ps( &A[8] );
+		d_01 = _mm256_mul_ps( a_00, b_11 );
+		a_00 = _mm256_load_ps( &A[16] );
+		d_02 = _mm256_mul_ps( a_00, b_22 );
+		a_00 = _mm256_load_ps( &A[24] );
+		d_03 = _mm256_mul_ps( a_00, b_33 );
+
+		_mm256_maskstore_ps( &D[0], mask_i, d_00 );
+		_mm256_maskstore_ps( &D[8], mask_i, d_01 );
+		_mm256_maskstore_ps( &D[16], mask_i, d_02 );
+		_mm256_maskstore_ps( &D[24], mask_i, d_03 );
+
+		}
+	
+	}
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 8;
+
+	int k;
+
+	__m256
+		alpha0, beta0,
+		mask_f,
+		sign,
+		a_00,
+		b_00, b_11, b_22, b_33,
+		c_00,
+		d_00, d_01, d_02, d_03;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_ss( alpha );
+	beta0  = _mm256_broadcast_ss( beta );
+	
+	b_00 = _mm256_broadcast_ss( &B[0] );
+	b_00 = _mm256_mul_ps( b_00, alpha0 );
+	b_11 = _mm256_broadcast_ss( &B[1] );
+	b_11 = _mm256_mul_ps( b_11, alpha0 );
+	b_22 = _mm256_broadcast_ss( &B[2] );
+	b_22 = _mm256_mul_ps( b_22, alpha0 );
+	b_33 = _mm256_broadcast_ss( &B[3] );
+	b_33 = _mm256_mul_ps( b_33, alpha0 );
+	
+	for(k=0; k<kmax-7; k+=8)
+		{
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+		a_00 = _mm256_load_ps( &A[8] );
+		d_01 = _mm256_mul_ps( a_00, b_11 );
+		a_00 = _mm256_load_ps( &A[16] );
+		d_02 = _mm256_mul_ps( a_00, b_22 );
+		a_00 = _mm256_load_ps( &A[24] );
+		d_03 = _mm256_mul_ps( a_00, b_33 );
+
+		c_00 = _mm256_load_ps( &C[0] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_00 = _mm256_add_ps( c_00, d_00 );
+		c_00 = _mm256_load_ps( &C[8] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_01 = _mm256_add_ps( c_00, d_01 );
+		c_00 = _mm256_load_ps( &C[16] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_02 = _mm256_add_ps( c_00, d_02 );
+		c_00 = _mm256_load_ps( &C[24] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_03 = _mm256_add_ps( c_00, d_03 );
+
+		_mm256_store_ps( &D[0], d_00 );
+		_mm256_store_ps( &D[8], d_01 );
+		_mm256_store_ps( &D[16], d_02 );
+		_mm256_store_ps( &D[24], d_03 );
+
+		A += 8*sda;
+		C += 8*sdc;
+		D += 8*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+		float m_f = kmax-k;
+
+		mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+		a_00 = _mm256_load_ps( &A[8] );
+		d_01 = _mm256_mul_ps( a_00, b_11 );
+		a_00 = _mm256_load_ps( &A[16] );
+		d_02 = _mm256_mul_ps( a_00, b_22 );
+		a_00 = _mm256_load_ps( &A[24] );
+		d_03 = _mm256_mul_ps( a_00, b_33 );
+
+		c_00 = _mm256_load_ps( &C[0] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_00 = _mm256_add_ps( c_00, d_00 );
+		c_00 = _mm256_load_ps( &C[8] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_01 = _mm256_add_ps( c_00, d_01 );
+		c_00 = _mm256_load_ps( &C[16] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_02 = _mm256_add_ps( c_00, d_02 );
+		c_00 = _mm256_load_ps( &C[24] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_03 = _mm256_add_ps( c_00, d_03 );
+
+		_mm256_maskstore_ps( &D[0], mask_i, d_00 );
+		_mm256_maskstore_ps( &D[8], mask_i, d_01 );
+		_mm256_maskstore_ps( &D[16], mask_i, d_02 );
+		_mm256_maskstore_ps( &D[24], mask_i, d_03 );
+
+		}
+	
+	}
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_3_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 8;
+
+	int k;
+
+	__m256
+		alpha0, beta0,
+		mask_f,
+		sign,
+		a_00,
+		b_00, b_11, b_22,
+		c_00,
+		d_00, d_01, d_02;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_ss( alpha );
+	beta0  = _mm256_broadcast_ss( beta );
+	
+	b_00 = _mm256_broadcast_ss( &B[0] );
+	b_00 = _mm256_mul_ps( b_00, alpha0 );
+	b_11 = _mm256_broadcast_ss( &B[1] );
+	b_11 = _mm256_mul_ps( b_11, alpha0 );
+	b_22 = _mm256_broadcast_ss( &B[2] );
+	b_22 = _mm256_mul_ps( b_22, alpha0 );
+	
+	for(k=0; k<kmax-7; k+=8)
+		{
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+		a_00 = _mm256_load_ps( &A[8] );
+		d_01 = _mm256_mul_ps( a_00, b_11 );
+		a_00 = _mm256_load_ps( &A[16] );
+		d_02 = _mm256_mul_ps( a_00, b_22 );
+
+		c_00 = _mm256_load_ps( &C[0] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_00 = _mm256_add_ps( c_00, d_00 );
+		c_00 = _mm256_load_ps( &C[8] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_01 = _mm256_add_ps( c_00, d_01 );
+		c_00 = _mm256_load_ps( &C[16] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_02 = _mm256_add_ps( c_00, d_02 );
+
+		_mm256_store_ps( &D[0], d_00 );
+		_mm256_store_ps( &D[8], d_01 );
+		_mm256_store_ps( &D[16], d_02 );
+
+		A += 8*sda;
+		C += 8*sdc;
+		D += 8*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+		float m_f = kmax-k;
+
+		mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+		a_00 = _mm256_load_ps( &A[8] );
+		d_01 = _mm256_mul_ps( a_00, b_11 );
+		a_00 = _mm256_load_ps( &A[16] );
+		d_02 = _mm256_mul_ps( a_00, b_22 );
+
+		c_00 = _mm256_load_ps( &C[0] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_00 = _mm256_add_ps( c_00, d_00 );
+		c_00 = _mm256_load_ps( &C[8] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_01 = _mm256_add_ps( c_00, d_01 );
+		c_00 = _mm256_load_ps( &C[16] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_02 = _mm256_add_ps( c_00, d_02 );
+
+		_mm256_maskstore_ps( &D[0], mask_i, d_00 );
+		_mm256_maskstore_ps( &D[8], mask_i, d_01 );
+		_mm256_maskstore_ps( &D[16], mask_i, d_02 );
+
+		}
+	
+	}
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_2_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 8;
+
+	int k;
+
+	__m256
+		alpha0, beta0,
+		mask_f,
+		sign,
+		a_00,
+		b_00, b_11,
+		c_00,
+		d_00, d_01;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_ss( alpha );
+	beta0  = _mm256_broadcast_ss( beta );
+	
+	b_00 = _mm256_broadcast_ss( &B[0] );
+	b_00 = _mm256_mul_ps( b_00, alpha0 );
+	b_11 = _mm256_broadcast_ss( &B[1] );
+	b_11 = _mm256_mul_ps( b_11, alpha0 );
+	
+	for(k=0; k<kmax-7; k+=8)
+		{
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+		a_00 = _mm256_load_ps( &A[8] );
+		d_01 = _mm256_mul_ps( a_00, b_11 );
+
+		c_00 = _mm256_load_ps( &C[0] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_00 = _mm256_add_ps( c_00, d_00 );
+		c_00 = _mm256_load_ps( &C[8] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_01 = _mm256_add_ps( c_00, d_01 );
+
+		_mm256_store_ps( &D[0], d_00 );
+		_mm256_store_ps( &D[8], d_01 );
+
+		A += 8*sda;
+		C += 8*sdc;
+		D += 8*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+		float m_f = kmax-k;
+
+		mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+		a_00 = _mm256_load_ps( &A[8] );
+		d_01 = _mm256_mul_ps( a_00, b_11 );
+
+		c_00 = _mm256_load_ps( &C[0] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_00 = _mm256_add_ps( c_00, d_00 );
+		c_00 = _mm256_load_ps( &C[8] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_01 = _mm256_add_ps( c_00, d_01 );
+
+		_mm256_maskstore_ps( &D[0], mask_i, d_00 );
+		_mm256_maskstore_ps( &D[8], mask_i, d_01 );
+
+		}
+	
+	}
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_1_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 8;
+
+	int k;
+
+	__m256
+		alpha0, beta0,
+		mask_f,
+		sign,
+		a_00,
+		b_00,
+		c_00,
+		d_00;
+	
+	__m256i
+		mask_i;
+	
+	alpha0 = _mm256_broadcast_ss( alpha );
+	beta0  = _mm256_broadcast_ss( beta );
+	
+	b_00 = _mm256_broadcast_ss( &B[0] );
+	b_00 = _mm256_mul_ps( b_00, alpha0 );
+	
+	for(k=0; k<kmax-7; k+=8)
+		{
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+
+		c_00 = _mm256_load_ps( &C[0] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_00 = _mm256_add_ps( c_00, d_00 );
+
+		_mm256_store_ps( &D[0], d_00 );
+
+		A += 8*sda;
+		C += 8*sdc;
+		D += 8*sdd;
+
+		}
+	if(k<kmax)
+		{
+
+		const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+		float m_f = kmax-k;
+
+		mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+		a_00 = _mm256_load_ps( &A[0] );
+		d_00 = _mm256_mul_ps( a_00, b_00 );
+
+		c_00 = _mm256_load_ps( &C[0] );
+		c_00 = _mm256_mul_ps( c_00, beta0 );
+		d_00 = _mm256_add_ps( c_00, d_00 );
+
+		_mm256_maskstore_ps( &D[0], mask_i, d_00 );
+
+		}
+	
+	}
+
+
+
+
diff --git a/kernel/avx/kernel_sgemv_4_lib8.S b/kernel/avx/kernel_sgemv_4_lib8.S
new file mode 100644
index 0000000..1508ebe
--- /dev/null
+++ b/kernel/avx/kernel_sgemv_4_lib8.S
@@ -0,0 +1,2935 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
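+// PROLOGUE/EPILOGUE save and restore the callee-saved general-purpose registers
+// of the System V AMD64 ABI (rbx, rbp, r12-r15) and issue vzeroupper on entry
+// and exit to avoid AVX-SSE transition penalties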
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
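+// the Windows x64 ABI additionally treats rdi, rsi and xmm6-xmm15 as
+// callee-saved, hence the larger frame and the extra spills in this version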
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(float)
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x+k*sizeof(float)
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemv_add_t_4_lib8, @function
+inner_kernel_gemv_add_t_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_t_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemv_add_t_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_t_4_lib8:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$8, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovups		0(%r13), %ymm12
+
+	vmovaps		0(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	
+	subl	$8, %r10d
+
+	vmovaps		32(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	
+	vmovaps		64(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+
+	vmovaps		96(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+	
+	addq	%r12, %r11
+	addq	$32, %r13
+	
+	cmpl	$7, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
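+	// build an 8-lane mask selecting the remaining (<8) rows: LC00 = {0.5,...,7.5}
+	// minus the remaining count is negative exactly in those lanes, and vmaskmovps
+	// uses the sign bits to load only the valid parts of x and A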
+	
+	vcvtsi2ss	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%ymm14, %ymm13, %ymm14
+
+	vmaskmovps	0(%r13), %ymm14, %ymm12
+
+	vmaskmovps	0(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	
+	vmaskmovps	32(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	
+	vmaskmovps	64(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+
+	vmaskmovps	96(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+		
+	sall	$2, %r10d // *sizeof(float)
+	addq	%r10, %r11
+	addq	%r10, %r13
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemv_add_t_4_lib8, .-inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x_t
+// r14   <- z_n
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(float)
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x_t+k*sizeof(float)
+// r14   <- z_n+k*sizeof(float)
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
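+//
+// note: this fused nt kernel accumulates the A^T * x_t partial sums in ymm0-ymm3
+// (as in the _t_ routine above) and in the same pass updates z_n += A * x_n,
+// with the four x_n entries pre-broadcast in ymm6-ymm9 and z_n loaded, updated
+// and stored back 8 elements per iteration
+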
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemv_add_nt_4_lib8, @function
+inner_kernel_gemv_add_nt_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_nt_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemv_add_nt_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_nt_4_lib8:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$8, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovups	0(%r13), %ymm12
+	vmovups	0(%r14), %ymm13
+
+	vmovaps	0(%r11), %ymm14
+	vmulps	%ymm14, %ymm12, %ymm15
+	vaddps	%ymm0, %ymm15, %ymm0
+	vmulps	%ymm14, %ymm6, %ymm15
+	vaddps	%ymm13, %ymm15, %ymm13
+	
+	subl	$8, %r10d
+
+	vmovaps	32(%r11), %ymm14
+	vmulps	%ymm14, %ymm12, %ymm15
+	vaddps	%ymm1, %ymm15, %ymm1
+	vmulps	%ymm14, %ymm7, %ymm15
+	vaddps	%ymm13, %ymm15, %ymm13
+	
+	vmovaps	64(%r11), %ymm14
+	vmulps	%ymm14, %ymm12, %ymm15
+	vaddps	%ymm2, %ymm15, %ymm2
+	vmulps	%ymm14, %ymm8, %ymm15
+	vaddps	%ymm13, %ymm15, %ymm13
+
+	vmovaps	96(%r11), %ymm14
+	vmulps	%ymm14, %ymm12, %ymm15
+	vaddps	%ymm3, %ymm15, %ymm3
+	vmulps	%ymm14, %ymm9, %ymm15
+	vaddps	%ymm13, %ymm15, %ymm13
+	
+	vmovups	%ymm13, 0(%r14) 
+
+	addq	%r12, %r11
+	addq	$32, %r13
+	addq	$32, %r14
+	
+	cmpl	$7, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vcvtsi2ss	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm13
+#endif
+	vshufps		$0x0, %xmm14, %xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%ymm14, %ymm13, %ymm11
+
+	vmaskmovps	0(%r13), %ymm11, %ymm12
+	vmaskmovps	0(%r14), %ymm11, %ymm13
+
+//	vmovups	%ymm14, -32(%rsp) // spill mask to stack
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovps	0(%r11), %ymm11, %ymm14
+	vmulps	%ymm14, %ymm12, %ymm15
+	vaddps	%ymm0, %ymm15, %ymm0
+	vmulps	%ymm14, %ymm6, %ymm15
+	vaddps	%ymm13, %ymm15, %ymm13
+	
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovps	32(%r11), %ymm11, %ymm14
+	vmulps	%ymm14, %ymm12, %ymm15
+	vaddps	%ymm1, %ymm15, %ymm1
+	vmulps	%ymm14, %ymm7, %ymm15
+	vaddps	%ymm13, %ymm15, %ymm13
+	
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovps	64(%r11), %ymm11, %ymm14
+	vmulps	%ymm14, %ymm12, %ymm15
+	vaddps	%ymm2, %ymm15, %ymm2
+	vmulps	%ymm14, %ymm8, %ymm15
+	vaddps	%ymm13, %ymm15, %ymm13
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovps	96(%r11), %ymm11, %ymm14
+	vmulps	%ymm14, %ymm12, %ymm15
+	vaddps	%ymm3, %ymm15, %ymm3
+	vmulps	%ymm14, %ymm9, %ymm15
+	vaddps	%ymm13, %ymm15, %ymm13
+		
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+	vmaskmovps	%ymm13, %ymm11, 0(%r14)
+
+	sall	$2, %r10d // *sizeof(float)
+	addq	%r10, %r11
+	addq	%r10, %r13
+	addq	%r10, %r14
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemv_add_nt_4_lib8, .-inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x
+// r14d  <- offA
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 
+// r11   <- 
+// r12   <- 
+// r13   <- 
+// r14d  <- offA
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_GEMV_ADD_T_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_gemv_add_t_4_lib8, @function
+inner_edge_gemv_add_t_4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemv_add_t_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_gemv_add_t_4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemv_add_t_4_lib8:
+#endif
+#endif
+
+	cmpl	$0, %r14d
+	jle		0f // return
+
+	movl	%r14d, %r15d
+	sall	$2, %r15d // offA*sizeof(float)
+
+	subq	%r15, %r11 // A - offA
+	subq	%r15, %r13 // x - offA
+
+	movl	%r10d, %r15d // kmax
+	addl	%r14d, %r15d // kmax + offA
+
+	vcvtsi2ss	%r14d, %xmm14, %xmm14 // offA
+	vcvtsi2ss	%r15d, %xmm15, %xmm15 // offA + kmax
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm13, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm13, %ymm15
+	vandps		%ymm15, %ymm14, %ymm14
+
+	vmaskmovps	0(%r13), %ymm14, %ymm12
+
+	vmovaps		0(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	
+	vmovaps		32(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	
+	vmovaps		64(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+
+	vmovaps		96(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+	addq	$32, %r13 // x + 4
+	addq	%r12, %r11 // A + bs*sda
+		
+	addl	%r14d, %r10d
+	subl	$8, %r10d // kmax - (8-offA)
+	
+0: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_gemv_add_t_4_lib8, .-inner_edge_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+
+
+
+#if 0
+// TODO
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsv_lt_inv_8_lib8, @function
+inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_lib8:
+#endif
+#endif
+	
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vmovaps			0(%r10), %ymm12
+	vblendps		$0x01, %ymm14, %ymm12, %ymm12
+	vmovaps			32(%r10), %ymm13
+	vblendps		$0x03, %ymm14, %ymm13, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm8
+	vunpckhps		%ymm13, %ymm12, %ymm9
+
+	vmovaps			64(%r10), %ymm12
+	vblendps		$0x07, %ymm14, %ymm12, %ymm12
+	vmovaps			96(%r10), %ymm13
+	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm10
+	vunpckhps		%ymm13, %ymm12, %ymm11
+
+	vshufps			$0x44, %ymm10, %ymm8, %ymm7
+	vshufps			$0xee, %ymm10, %ymm8, %ymm4
+	vshufps			$0x44, %ymm11, %ymm9, %ymm5
+	vshufps			$0xee, %ymm11, %ymm9, %ymm6
+	vextractf128	$0x1, %ymm7, %xmm7
+	vextractf128	$0x1, %ymm4, %xmm8
+	vextractf128	$0x1, %ymm5, %xmm9
+	vextractf128	$0x1, %ymm6, %xmm10
+
+	vmovaps			144(%r10), %xmm12
+	vblendps		$0x01, %xmm14, %xmm12, %xmm12
+	vmovaps			176(%r10), %xmm13
+	vblendps		$0x03, %xmm14, %xmm13, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm1
+	vunpckhps		%xmm13, %xmm12, %xmm2
+
+	vmovaps			208(%r10), %xmm12
+	vblendps		$0x07, %xmm14, %xmm12, %xmm12
+	vmovaps			240(%r10), %xmm13
+	vblendps		$0x0f, %xmm14, %xmm13, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm3
+	vunpckhps		%xmm13, %xmm12, %xmm15
+
+	vshufps			$0xee, %xmm3, %xmm1, %xmm11
+	vshufps			$0x44, %xmm15, %xmm2, %xmm12
+	vshufps			$0xee, %xmm15, %xmm2, %xmm13
+
+
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vextractf128	$0x1, %ymm0, %xmm1
+
+	vshufps			$0xff, %xmm1, %xmm1, %xmm2
+	vbroadcastss	28(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x08, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm10, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm13, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+	vshufps			$0xaa, %xmm1, %xmm1, %xmm2
+	vbroadcastss	24(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x04, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm9, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm12, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+	vshufps			$0x55, %xmm1, %xmm1, %xmm2
+	vbroadcastss	20(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x02, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm8, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm11, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+	vshufps			$0x00, %xmm1, %xmm1, %xmm2
+	vbroadcastss	16(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x01, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm7, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+	vshufps			$0xff, %xmm0, %xmm0, %xmm2
+	vbroadcastss	12(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x08, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm6, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+	vshufps			$0xaa, %xmm0, %xmm0, %xmm2
+	vbroadcastss	8(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x04, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm5, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+	vshufps			$0x55, %xmm0, %xmm0, %xmm2
+	vbroadcastss	4(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x02, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm4, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+	vshufps			$0x00, %xmm0, %xmm0, %xmm2
+	vbroadcastss	0(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x01, %xmm2, %xmm0, %xmm0
+
+	vinsertf128		$0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsv_lt_inv_8_lib8, .-inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- km
+// r13  <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- km
+// r13  <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsv_lt_inv_8_vs_lib8, @function
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#endif
+#endif
+	
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vmovaps			0(%r10), %ymm12
+	vblendps		$0x01, %ymm14, %ymm12, %ymm12
+	cmpl	$2, %r13d
+	jl		1f
+	vmovaps			32(%r10), %ymm13
+	vblendps		$0x03, %ymm14, %ymm13, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm8
+	vunpckhps		%ymm13, %ymm12, %ymm9
+
+	cmpl	$3, %r13d
+	jl		2f
+	vmovaps			64(%r10), %ymm12
+	vblendps		$0x07, %ymm14, %ymm12, %ymm12
+	cmpl	$4, %r13d
+	jl		3f
+	vmovaps			96(%r10), %ymm13
+	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm10
+	vunpckhps		%ymm13, %ymm12, %ymm11
+
+	vshufps			$0x44, %ymm10, %ymm8, %ymm7
+	vshufps			$0xee, %ymm10, %ymm8, %ymm4
+	vshufps			$0x44, %ymm11, %ymm9, %ymm5
+	vshufps			$0xee, %ymm11, %ymm9, %ymm6
+	vextractf128	$0x1, %ymm7, %xmm7
+	vextractf128	$0x1, %ymm4, %xmm8
+	vextractf128	$0x1, %ymm5, %xmm9
+	vextractf128	$0x1, %ymm6, %xmm10
+
+	cmpl	$5, %r13d
+	jl		4f
+	vmovaps			144(%r10), %xmm12
+	vblendps		$0x01, %xmm14, %xmm12, %xmm12
+	cmpl	$6, %r13d
+	jl		5f
+	vmovaps			176(%r10), %xmm13
+	vblendps		$0x03, %xmm14, %xmm13, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm1
+	vunpckhps		%xmm13, %xmm12, %xmm2
+
+	cmpl	$7, %r13d
+	jl		6f
+	vmovaps			208(%r10), %xmm12
+	vblendps		$0x07, %xmm14, %xmm12, %xmm12
+	cmpl	$8, %r13d
+	jl		7f
+	vmovaps			240(%r10), %xmm13
+	vblendps		$0x0f, %xmm14, %xmm13, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm3
+	vunpckhps		%xmm13, %xmm12, %xmm15
+
+	vshufps			$0xee, %xmm3, %xmm1, %xmm11
+	vshufps			$0x44, %xmm15, %xmm2, %xmm12
+	vshufps			$0xee, %xmm15, %xmm2, %xmm13
+
+	jmp		0f
+
+
+
+	vmovaps			%ymm14, %ymm12
+1:
+	vmovaps			%ymm14, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm8
+	vunpckhps		%ymm13, %ymm12, %ymm9
+
+2:
+	vmovaps			%ymm14, %ymm12
+3:
+	vmovaps			%ymm14, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm10
+	vunpckhps		%ymm13, %ymm12, %ymm11
+
+	vshufps			$0x44, %ymm10, %ymm8, %ymm7
+	vshufps			$0xee, %ymm10, %ymm8, %ymm4
+	vshufps			$0x44, %ymm11, %ymm9, %ymm5
+	vshufps			$0xee, %ymm11, %ymm9, %ymm6
+	vextractf128	$0x1, %ymm7, %xmm7
+	vextractf128	$0x1, %ymm4, %xmm8
+	vextractf128	$0x1, %ymm5, %xmm9
+	vextractf128	$0x1, %ymm6, %xmm10
+
+	jmp		8f
+
+4:
+	vmovaps			%xmm14, %xmm12
+5:
+	vmovaps			%xmm14, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm1
+	vunpckhps		%xmm13, %xmm12, %xmm2
+
+6:
+	vmovaps			%xmm14, %xmm12
+7:
+	vmovaps			%xmm14, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm3
+	vunpckhps		%xmm13, %xmm12, %xmm15
+
+	vshufps			$0xee, %xmm3, %xmm1, %xmm11
+	vshufps			$0x44, %xmm15, %xmm2, %xmm12
+	vshufps			$0xee, %xmm15, %xmm2, %xmm13
+
+8:
+	
+	vmovaps			%xmm14, %xmm11
+	vmovaps			%xmm14, %xmm12
+	vmovaps			%xmm14, %xmm13
+
+0:
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vextractf128	$0x1, %ymm0, %xmm1
+
+	cmpl	$8, %r12d
+	jl		0f
+
+	vshufps			$0xff, %xmm1, %xmm1, %xmm2
+	vbroadcastss	28(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x08, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm10, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm13, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+0:
+	cmpl	$7, %r12d
+	jl		0f
+
+	vshufps			$0xaa, %xmm1, %xmm1, %xmm2
+	vbroadcastss	24(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x04, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm9, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm12, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+0:
+	cmpl	$6, %r12d
+	jl		0f
+
+	vshufps			$0x55, %xmm1, %xmm1, %xmm2
+	vbroadcastss	20(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x02, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm8, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm11, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+0:
+	cmpl	$5, %r12d
+	jl		0f
+
+	vshufps			$0x00, %xmm1, %xmm1, %xmm2
+	vbroadcastss	16(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x01, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm7, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+0:
+	cmpl	$4, %r12d
+	jl		0f
+
+	vshufps			$0xff, %xmm0, %xmm0, %xmm2
+	vbroadcastss	12(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x08, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm6, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+0:
+	cmpl	$3, %r12d
+	jl		0f
+
+	vshufps			$0xaa, %xmm0, %xmm0, %xmm2
+	vbroadcastss	8(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x04, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm5, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+0:
+	cmpl	$2, %r12d
+	jl		0f
+
+	vshufps			$0x55, %xmm0, %xmm0, %xmm2
+	vbroadcastss	4(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x02, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm4, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+0:
+	cmpl	$1, %r12d
+	jl		0f
+
+	vshufps			$0x00, %xmm0, %xmm0, %xmm2
+	vbroadcastss	0(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x01, %xmm2, %xmm0, %xmm0
+
+0:
+
+	vinsertf128		$0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsv_lt_inv_8_vs_lib8, .-inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10   <- kmax
+// r11   <- A
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x_t
+// r14   <- z_n
+// r15   <- offA
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- kmax-4
+// r11   <- A+4*k*sizeof(float)
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x_t+k*sizeof(float)
+// r14   <- z_n+k*sizeof(float)
+// r15   <- offA
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
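+//
+// this edge handles the 4-column diagonal block of the lower-stored symmetric
+// matrix: for column jj the blends zero the lanes above the diagonal before the
+// transposed (x_t) product and the lanes up to and including the diagonal before
+// the non-transposed (x_n) product, so in effect the diagonal entry contributes
+// once and each strictly-lower entry contributes to both sweeps.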
+
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_SYMV_ADD_NT_4L_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_symv_add_nt_4l_lib8, @function
+inner_edge_symv_add_nt_4l_lib8:
+#elif defined(OS_MAC)
+_inner_edge_symv_add_nt_4l_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_symv_add_nt_4l_lib8; .scl 2; .type 32; .endef
+inner_edge_symv_add_nt_4l_lib8:
+#endif
+#endif
+
+	movl	$8, %eax
+	cmpl	%eax, %r10d
+	jge		0f
+	movl	%r10d, %eax
+0:
+	subl	%r15d, %eax
+
+	vcvtsi2ss	%eax, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm13
+#endif
+	vshufps		$0x0, %xmm14, %xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%ymm14, %ymm13, %ymm11
+
+	vmaskmovps	0(%r13), %ymm11, %ymm12
+	vmaskmovps	0(%r14), %ymm11, %ymm13
+
+	vmaskmovps	0(%r11), %ymm11, %ymm14
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vxorps		%ymm15, %ymm15, %ymm15
+	vblendps	$0x01, %ymm15, %ymm14, %ymm14
+	vmulps		%ymm14, %ymm6, %ymm15
+	vaddps		%ymm13, %ymm15, %ymm13
+	
+	vmaskmovps	32(%r11), %ymm11, %ymm14
+	vxorps		%ymm15, %ymm15, %ymm15
+	vblendps	$0x01, %ymm15, %ymm14, %ymm14
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vxorps		%ymm15, %ymm15, %ymm15
+	vblendps	$0x03, %ymm15, %ymm14, %ymm14
+	vmulps		%ymm14, %ymm7, %ymm15
+	vaddps		%ymm13, %ymm15, %ymm13
+	
+	vmaskmovps	64(%r11), %ymm11, %ymm14
+	vxorps		%ymm15, %ymm15, %ymm15
+	vblendps	$0x03, %ymm15, %ymm14, %ymm14
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vxorps		%ymm15, %ymm15, %ymm15
+	vblendps	$0x07, %ymm15, %ymm14, %ymm14
+	vmulps		%ymm14, %ymm8, %ymm15
+	vaddps		%ymm13, %ymm15, %ymm13
+
+	vmaskmovps	96(%r11), %ymm11, %ymm14
+	vxorps		%ymm15, %ymm15, %ymm15
+	vblendps	$0x07, %ymm15, %ymm14, %ymm14
+	vmulps		%ymm14, %ymm12, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+	vxorps		%ymm15, %ymm15, %ymm15
+	vblendps	$0x0f, %ymm15, %ymm14, %ymm14
+	vmulps		%ymm14, %ymm9, %ymm15
+	vaddps		%ymm13, %ymm15, %ymm13
+	
+	vmaskmovps	%ymm13, %ymm11, 0(%r14)
+
+	subl	%eax, %r10d
+
+	salq	$2, %rax // *sizeof(float)
+	addq	%rax, %r11
+	subq	$32, %r11
+	addq	%r12, %r11
+	addq	%rax, %r13
+	addq	%rax, %r14
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_symv_add_nt_4l_lib8, .-inner_edge_symv_add_nt_4l_lib8
+#endif
+#endif
+
+
+
+
+
+
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_SYMV_ADD_NT_4R_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_symv_add_nt_4r_lib8, @function
+inner_edge_symv_add_nt_4r_lib8:
+#elif defined(OS_MAC)
+_inner_edge_symv_add_nt_4r_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_symv_add_nt_4r_lib8; .scl 2; .type 32; .endef
+inner_edge_symv_add_nt_4r_lib8:
+#endif
+#endif
+
+	movl	$4, %eax
+	cmpl	%eax, %r10d
+	jge		0f
+	movl	%r10d, %eax
+0:
+	subl	%r15d, %eax
+
+	vcvtsi2ss	%eax, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %xmm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %xmm13
+#endif
+	vshufps		$0x0, %xmm14, %xmm14, %xmm14
+//	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%xmm14, %xmm13, %xmm11
+
+	vmaskmovps	0(%r13), %xmm11, %xmm12
+	vmaskmovps	0(%r14), %xmm11, %xmm13
+
+	vmaskmovps	0(%r11), %xmm11, %xmm14
+	vmulps		%xmm14, %xmm12, %xmm15
+	vaddps		%xmm0, %xmm15, %xmm0
+	vxorps		%xmm15, %xmm15, %xmm15
+	vblendps	$0x01, %xmm15, %xmm14, %xmm14
+	vmulps		%xmm14, %xmm6, %xmm15
+	vaddps		%xmm13, %xmm15, %xmm13
+	
+	vmaskmovps	32(%r11), %xmm11, %xmm14
+	vxorps		%xmm15, %xmm15, %xmm15
+	vblendps	$0x01, %xmm15, %xmm14, %xmm14
+	vmulps		%xmm14, %xmm12, %xmm15
+	vaddps		%xmm1, %xmm15, %xmm1
+	vxorps		%xmm15, %xmm15, %xmm15
+	vblendps	$0x03, %xmm15, %xmm14, %xmm14
+	vmulps		%xmm14, %xmm7, %xmm15
+	vaddps		%xmm13, %xmm15, %xmm13
+	
+	vmaskmovps	64(%r11), %xmm11, %xmm14
+	vxorps		%xmm15, %xmm15, %xmm15
+	vblendps	$0x03, %xmm15, %xmm14, %xmm14
+	vmulps		%xmm14, %xmm12, %xmm15
+	vaddps		%xmm2, %xmm15, %xmm2
+	vxorps		%xmm15, %xmm15, %xmm15
+	vblendps	$0x07, %xmm15, %xmm14, %xmm14
+	vmulps		%xmm14, %xmm8, %xmm15
+	vaddps		%xmm13, %xmm15, %xmm13
+
+	vmaskmovps	96(%r11), %xmm11, %xmm14
+	vxorps		%xmm15, %xmm15, %xmm15
+	vblendps	$0x07, %xmm15, %xmm14, %xmm14
+	vmulps		%xmm14, %xmm12, %xmm15
+	vaddps		%xmm3, %xmm15, %xmm3
+//	vxorps		%xmm15, %xmm15, %xmm15
+//	vblendps	$0x0f, %xmm15, %xmm14, %xmm14
+//	vmulps		%xmm14, %xmm9, %xmm15
+//	vaddps		%xmm13, %xmm15, %xmm13
+	
+	vmaskmovps	%xmm13, %xmm11, 0(%r14)
+
+	subl	%eax, %r10d
+
+	salq	$2, %rax // *sizeof(float)
+	addq	%rax, %r11
+	subq	$32, %r11
+	addq	%r12, %r11
+	addq	%rax, %r13
+	addq	%rax, %r14
+	
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_symv_add_nt_4r_lib8, .-inner_edge_symv_add_nt_4r_lib8
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_ab_4_lib8, @function
+inner_blend_t_scale_ab_4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_ab_4_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_4_lib8:
+#endif
+#endif
+
+	// reduction
+	vhaddps			%ymm1, %ymm0, %ymm0
+	vhaddps			%ymm3, %ymm2, %ymm2
+
+	vhaddps			%ymm2, %ymm0, %ymm0
+
+	vextractf128	$0x1, %ymm0, %xmm1
+
+	vaddps			%xmm0, %xmm1, %xmm0
+
+	// alpha
+	vbroadcastss	0(%r10), %xmm15
+	vmulps			%xmm0, %xmm15, %xmm0
+
+	// beta
+	vbroadcastss	0(%r11), %xmm15
+	vmovups			0(%r12), %xmm14
+	vmulps			%xmm15, %xmm14, %xmm14
+	vaddps			%xmm0, %xmm14, %xmm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_ab_4_lib8, .-inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_a1_4_lib8, @function
+inner_blend_t_scale_a1_4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_a1_4_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib8:
+#endif
+#endif
+
+	// reduction
+	vhaddps			%ymm1, %ymm0, %ymm0
+	vhaddps			%ymm3, %ymm2, %ymm2
+
+	vhaddps			%ymm2, %ymm0, %ymm0
+
+	vextractf128	$0x1, %ymm0, %xmm1
+
+	vaddps			%xmm0, %xmm1, %xmm0
+
+	// alpha
+	vbroadcastss	0(%r10), %xmm15
+	vmulps			%xmm0, %xmm15, %xmm0
+
+	// beta
+	vmovups			0(%r11), %xmm14
+	vaddps			%xmm0, %xmm14, %xmm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_a1_4_lib8, .-inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_M11_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_m11_4_lib8, @function
+inner_blend_t_scale_m11_4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_m11_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_m11_4_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_m11_4_lib8:
+#endif
+#endif
+
+	// reduction
+	vhaddps			%ymm1, %ymm0, %ymm0
+	vhaddps			%ymm3, %ymm2, %ymm2
+
+	vhaddps			%ymm2, %ymm0, %ymm0
+
+	vextractf128	$0x1, %ymm0, %xmm1
+
+	vaddps			%xmm0, %xmm1, %xmm0
+
+	// beta
+	vmovups			0(%r10), %xmm14
+	vsubps			%xmm0, %xmm14, %xmm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_m11_4_lib8, .-inner_blend_t_scale_m11_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store 
+//
+// input arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4_lib8, @function
+inner_store_4_lib8:
+#elif defined(OS_MAC)
+_inner_store_4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4_lib8; .scl 2; .type 32; .endef
+inner_store_4_lib8:
+#endif
+#endif
+	
+	vmovups %xmm0,  0(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4_lib8, .-inner_store_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store vs
+//
+// input arguments:
+// r10   <- D
+// r11d   <- km
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d   <- km
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4_vs_lib8, @function
+inner_store_4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_4_vs_lib8:
+#endif
+#endif
+	
+	vcvtsi2ss	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %xmm14
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %xmm14
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+//	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%xmm15, %xmm14, %xmm15
+
+	vmaskmovps	%xmm0, %xmm15,  0(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4_vs_lib8, .-inner_store_4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store gen
+//
+// input arguments:
+// r10   <- D
+// r11d  <- k0 : start from (inc)
+// r12d  <- k1 : up to (exc)
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d  <- k0 : start from (inc)
+// r12d  <- k1 : up to (exc)
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4_gen_lib8, @function
+inner_store_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm14, %xmm14
+	vcvtsi2ss	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %xmm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+//	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+//	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%xmm12, %xmm14, %xmm14
+	vsubps		%xmm15, %xmm12, %xmm15
+	vandps		%xmm14, %xmm15, %xmm15
+
+	vmaskmovps	%xmm0, %xmm15,  0(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4_gen_lib8, .-inner_store_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                            1      2              3          4        5          6             7         8
+// void kernel_sgemv_t_4_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);
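+//
+// z[0:4] = alpha*A'*x + beta*y, with A a k x 4 block stored in column-major
+// panels of 8 rows (lib8 layout, panel stride sda); the four dot products are
+// accumulated in ymm0..ymm3 and reduced by the blend/scale routine before the
+// 4-wide store.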
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_t_4_lib8
+	.type kernel_sgemv_t_4_lib8, @function
+kernel_sgemv_t_4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_t_4_lib8
+_kernel_sgemv_t_4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_t_4_lib8
+	.def kernel_sgemv_t_4_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_t_4_lib8, .-kernel_sgemv_t_4_lib8
+#endif
+
+
+
+
+
+//                               1      2              3          4        5          6             7         8           9
+// void kernel_sgemv_t_4_vs_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
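+//
+// same operation as kernel_sgemv_t_4_lib8, but only the first k1 (<=4) results
+// are written back, via the masked store in inner_store_4_vs_lib8.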
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_t_4_vs_lib8
+	.type kernel_sgemv_t_4_vs_lib8, @function
+kernel_sgemv_t_4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_t_4_vs_lib8
+_kernel_sgemv_t_4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_t_4_vs_lib8
+	.def kernel_sgemv_t_4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+	movq	ARG9, %r11 // k1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_t_4_vs_lib8, .-kernel_sgemv_t_4_vs_lib8
+#endif
+
+
+
+
+
+//                                1      2              3         4          5        6          7             8          9          10
+// void kernel_sgemv_t_4_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km);
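+//
+// generalized variant: offA is the row offset of A inside its first 8-row panel
+// (handled by the gemv_add_t edge routine below) and km limits how many of the
+// 4 results are stored.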
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_t_4_gen_lib8
+	.type kernel_sgemv_t_4_gen_lib8, @function
+kernel_sgemv_t_4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_t_4_gen_lib8
+_kernel_sgemv_t_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_t_4_gen_lib8
+	.def kernel_sgemv_t_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // x
+	movq	ARG3, %r14 // offA
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemv_add_t_4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11   // beta
+	movq	ARG8, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG9, %r10 // z 
+	movq	ARG10, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_t_4_gen_lib8, .-kernel_sgemv_t_4_gen_lib8
+#endif
+
+
+
+
+
+#if 0
+// TODO
+
+//                                 1      2          3        4                   5          6          7
+// void kernel_strsv_lt_inv_8_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsv_lt_inv_8_lib8
+	.type kernel_strsv_lt_inv_8_lib8, @function
+kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsv_lt_inv_8_lib8
+_kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsv_lt_inv_8_lib8
+	.def kernel_strsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	subl	$8, %r10d
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	addq	%r12, %r11 // A+8*sda*sizeof(float)
+	movq	ARG5, %r13 // x
+	addq	$32, %r13 // x+8 
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG6, %r10 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+	// solution
+
+	movq	ARG2, %r10 // A
+	movq	ARG4, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsv_lt_inv_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsv_lt_inv_8_lib8, .-kernel_strsv_lt_inv_8_lib8
+#endif
+
+
+
+
+
+//                                    1      2          3        4                   5          6          7          8      9
+// void kernel_strsv_lt_inv_8_vs_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsv_lt_inv_8_vs_lib8
+	.type kernel_strsv_lt_inv_8_vs_lib8, @function
+kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsv_lt_inv_8_vs_lib8
+_kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsv_lt_inv_8_vs_lib8
+	.def kernel_strsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	subl	$8, %r10d
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	addq	%r12, %r11 // A+8*sda*sizeof(float)
+	movq	ARG5, %r13 // x
+	addq	$32, %r13 // x+8 
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG6, %r10 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+	// solution
+
+	movq	ARG2, %r10 // A
+	movq	ARG4, %r11 // inv_diag_A
+	movq	ARG8, %r12 // km
+	movq	ARG9, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsv_lt_inv_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	ARG8, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsv_lt_inv_8_vs_lib8, .-kernel_strsv_lt_inv_8_vs_lib8
+#endif
+
+#endif
+
+
+
+
+
+//                             1      2                3                4          5        6            7            8               9            10           11
+// void kernel_sgemv_nt_4_lib8(int k, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t);
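+//
+// single pass over the k x 4 block A (lib8 layout) computing both
+//   z_t[0:4]  = alpha_t*A'*x_t + beta_t*y_t
+//   z_n[0:k] += alpha_n*A*x_n[0:4]          (updated in place)
+// the alpha_n scaling is folded into the x_n broadcasts below.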
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_nt_4_lib8
+	.type kernel_sgemv_nt_4_lib8, @function
+kernel_sgemv_nt_4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_nt_4_lib8
+_kernel_sgemv_nt_4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_nt_4_lib8
+	.def kernel_sgemv_nt_4_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_nt_4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha_n
+	vbroadcastss 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+
+	vbroadcastss 0(%r10), %ymm6
+	vmulps		%ymm15, %ymm6, %ymm6
+	vbroadcastss 4(%r10), %ymm7
+	vmulps		%ymm15, %ymm7, %ymm7
+	vbroadcastss 8(%r10), %ymm8
+	vmulps		%ymm15, %ymm8, %ymm8
+	vbroadcastss 12(%r10), %ymm9
+	vmulps		%ymm15, %ymm9, %ymm9
+
+
+	// inner kernel sgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+//	movslq	%r12d, %r12
+	movq	ARG7, %r13  // x_t
+	movq	ARG10, %r14  // z_n
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+	movq	ARG3, %r10 // alpha_t
+	movq	ARG8, %r11   // beta_t
+	movq	ARG9, %r12   // y_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG11, %r10 // z_t 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_nt_4_lib8, .-kernel_sgemv_nt_4_lib8
+#endif
+
+
+
+
+
+//                                1      2                3                4          5        6            7            8               9            10           11           12
+// void kernel_sgemv_nt_4_vs_lib8(int k, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km);
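+//
+// vs variant of kernel_sgemv_nt_4_lib8: km (<=4) limits both the x_n entries
+// that are broadcast and the z_t results that are stored.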
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_nt_4_vs_lib8
+	.type kernel_sgemv_nt_4_vs_lib8, @function
+kernel_sgemv_nt_4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_nt_4_vs_lib8
+_kernel_sgemv_nt_4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_nt_4_vs_lib8
+	.def kernel_sgemv_nt_4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_nt_4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha_n
+	vbroadcastss 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+	movq	ARG12, %r11 // km
+
+	vbroadcastss 0(%r10), %ymm6
+	vmulps		%ymm15, %ymm6, %ymm6
+	cmpl	$2, %r11d
+	jl		0f
+	vbroadcastss 4(%r10), %ymm7
+	vmulps		%ymm15, %ymm7, %ymm7
+	cmpl	$3, %r11d
+	jl		0f
+	vbroadcastss 8(%r10), %ymm8
+	vmulps		%ymm15, %ymm8, %ymm8
+	je		0f
+	vbroadcastss 12(%r10), %ymm9
+	vmulps		%ymm15, %ymm9, %ymm9
+0:
+
+	// inner kernel sgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+//	movslq	%r12d, %r12
+	movq	ARG7, %r13  // x_t
+	movq	ARG10, %r14  // z_n
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+	movq	ARG3, %r10 // alpha_t
+	movq	ARG8, %r11   // beta_t
+	movq	ARG9, %r12   // y_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG11, %r10 // z_t 
+	movq	ARG12, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_nt_4_vs_lib8, .-kernel_sgemv_nt_4_vs_lib8
+#endif
+
+
+
+
+
+//                             1      2              3          4        5          6
+// void kernel_ssymv_l_4l_lib8(int k, float *alpha, float *A, int sda, float *x, float *z);
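+//
+// one 4-column slice of a symmetric matrix-vector product using only the stored
+// lower triangle: the 4l edge handles the diagonal block, the nt kernel the
+// panels below it, and the final blend folds the transposed partial sums back
+// into z; x doubles as x_n and x_t, z as z_n and y_t.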
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssymv_l_4l_lib8
+	.type kernel_ssymv_l_4l_lib8, @function
+kernel_ssymv_l_4l_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssymv_l_4l_lib8
+_kernel_ssymv_l_4l_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssymv_l_4l_lib8
+	.def kernel_ssymv_l_4l_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4l_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha
+	vbroadcastss 0(%r10), %ymm15
+
+	movq	ARG5, %r10 // x_n
+
+	vbroadcastss 0(%r10), %ymm6
+	vmulps		%ymm15, %ymm6, %ymm6
+	vbroadcastss 4(%r10), %ymm7
+	vmulps		%ymm15, %ymm7, %ymm7
+	vbroadcastss 8(%r10), %ymm8
+	vmulps		%ymm15, %ymm8, %ymm8
+	vbroadcastss 12(%r10), %ymm9
+	vmulps		%ymm15, %ymm9, %ymm9
+
+
+	// inner edge ssymv & kernel sgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // x_t
+	movq	ARG6, %r14  // z_n
+	movq	$0, %r15 // offA
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_SYMV_ADD_NT_4L_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_symv_add_nt_4l_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_symv_add_nt_4l_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // z_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // z_t 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssymv_l_4l_lib8, .-kernel_ssymv_l_4l_lib8
+#endif
+
+
+
+
+
+//                             1      2              3          4        5          6
+// void kernel_ssymv_l_4r_lib8(int k, float *alpha, float *A, int sda, float *x, float *z);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssymv_l_4r_lib8
+	.type kernel_ssymv_l_4r_lib8, @function
+kernel_ssymv_l_4r_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssymv_l_4r_lib8
+_kernel_ssymv_l_4r_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssymv_l_4r_lib8
+	.def kernel_ssymv_l_4r_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4r_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha
+	vbroadcastss 0(%r10), %ymm15
+
+	movq	ARG5, %r10 // x_n
+
+	vbroadcastss 0(%r10), %ymm6
+	vmulps		%ymm15, %ymm6, %ymm6
+	vbroadcastss 4(%r10), %ymm7
+	vmulps		%ymm15, %ymm7, %ymm7
+	vbroadcastss 8(%r10), %ymm8
+	vmulps		%ymm15, %ymm8, %ymm8
+	vbroadcastss 12(%r10), %ymm9
+	vmulps		%ymm15, %ymm9, %ymm9
+
+
+	// inner edge ssymv & kernel sgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // x_t
+	movq	ARG6, %r14  // z_n
+	movq	$0, %r15 // offA
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_SYMV_ADD_NT_4R_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_symv_add_nt_4r_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_symv_add_nt_4r_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // z_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // z_t 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssymv_l_4r_lib8, .-kernel_ssymv_l_4r_lib8
+#endif
+
+
+
+
+
+//                                1      2              3          4          5        6          7          8
+// void kernel_ssymv_l_4l_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *z, int km);
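+//
+// gen variant of kernel_ssymv_l_4l_lib8: offA is the row offset of A inside its
+// first 8-row panel and km limits both the broadcast x entries and the stored
+// results.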
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssymv_l_4l_gen_lib8
+	.type kernel_ssymv_l_4l_gen_lib8, @function
+kernel_ssymv_l_4l_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssymv_l_4l_gen_lib8
+_kernel_ssymv_l_4l_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssymv_l_4l_gen_lib8
+	.def kernel_ssymv_l_4l_gen_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4l_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha
+	vbroadcastss 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+	movq	ARG8, %r11 // km
+
+	vbroadcastss 0(%r10), %ymm6
+	vmulps		%ymm15, %ymm6, %ymm6
+	cmpl	$2, %r11d
+	jl		0f
+	vbroadcastss 4(%r10), %ymm7
+	vmulps		%ymm15, %ymm7, %ymm7
+	cmpl	$3, %r11d
+	jl		0f
+	vbroadcastss 8(%r10), %ymm8
+	vmulps		%ymm15, %ymm8, %ymm8
+	je		0f
+	vbroadcastss 12(%r10), %ymm9
+	vmulps		%ymm15, %ymm9, %ymm9
+0:
+
+	// inner edge ssymv & kernel sgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // x_t
+	movq	ARG7, %r14  // z_n
+	movq	ARG3, %r15 // offA
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_SYMV_ADD_NT_4L_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_symv_add_nt_4l_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_symv_add_nt_4l_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11   // z_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z_t 
+	movq	ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssymv_l_4l_gen_lib8, .-kernel_ssymv_l_4l_gen_lib8
+#endif
+
+
+
+
+
+//                                1      2              3          4          5        6          7          8
+// void kernel_ssymv_l_4r_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *z, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssymv_l_4r_gen_lib8
+	.type kernel_ssymv_l_4r_gen_lib8, @function
+kernel_ssymv_l_4r_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssymv_l_4r_gen_lib8
+_kernel_ssymv_l_4r_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssymv_l_4r_gen_lib8
+	.def kernel_ssymv_l_4r_gen_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4r_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha
+	vbroadcastss 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+	movq	ARG8, %r11 // km
+
+	vbroadcastss 0(%r10), %ymm6
+	vmulps		%ymm15, %ymm6, %ymm6
+	cmpl	$2, %r11d
+	jl		0f
+	vbroadcastss 4(%r10), %ymm7
+	vmulps		%ymm15, %ymm7, %ymm7
+	cmpl	$3, %r11d
+	jl		0f
+	vbroadcastss 8(%r10), %ymm8
+	vmulps		%ymm15, %ymm8, %ymm8
+	je		0f
+	vbroadcastss 12(%r10), %ymm9
+	vmulps		%ymm15, %ymm9, %ymm9
+0:
+
+	// inner edge ssymv & kernel sgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // x_t
+	movq	ARG7, %r14  // z_n
+	movq	ARG3, %r15 // offA
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_SYMV_ADD_NT_4R_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_symv_add_nt_4r_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_symv_add_nt_4r_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11   // z_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z_t 
+	movq	ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssymv_l_4r_gen_lib8, .-kernel_ssymv_l_4r_gen_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.float	0.5
+	.float	1.5
+	.float	2.5
+	.float	3.5
+	.float	4.5
+	.float	5.5
+	.float	6.5
+	.float	7.5
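+// LC00 is used to build the vmaskmovps masks: an integer count is broadcast as a
+// float and compared against these lane indices (+0.5) via a subtraction, and
+// the resulting sign bits drive the masked loads/stores; the edge/gen routines
+// AND two such masks to select a lane range [k0,k1).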
+
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
+
diff --git a/kernel/avx/kernel_sgemv_8_lib8.S b/kernel/avx/kernel_sgemv_8_lib8.S
new file mode 100644
index 0000000..aafd8cb
--- /dev/null
+++ b/kernel/avx/kernel_sgemv_8_lib8.S
@@ -0,0 +1,2837 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- x
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z0 z1 z2 z3]_b
+// ymm2  <- [z0 z1 z2 z3]_c
+// ymm3  <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- x+k*sizeof(float)
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z0 z1 z2 z3]_b
+// ymm2  <- [z0 z1 z2 z3]_c
+// ymm3  <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
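+//
+// reference model (sketch only, for readability; A is assumed packed in
+// 8-row panels, column jj of the strip starting at A+8*jj floats):
+//
+//   for(jj=0; jj<k; jj++)
+//       for(ii=0; ii<8; ii++)
+//           z[ii] += A[ii+8*jj] * x[jj];  // z spread over ymm0..ymm3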
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemv_add_n_8_lib8, @function
+inner_kernel_gemv_add_n_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_n_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemv_add_n_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_n_8_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm0, %ymm15, %ymm0
+	
+	subl	$4, %r10d
+
+	vmovaps			32(%r11), %ymm8
+	vbroadcastss	4(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm1, %ymm15, %ymm1
+	
+	vmovaps			64(%r11), %ymm8
+	vbroadcastss	8(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm2, %ymm15, %ymm2
+
+	vmovaps			96(%r11), %ymm8
+	vbroadcastss	12(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm3, %ymm15, %ymm3
+	
+	addq	$128, %r11
+	addq	$16, %r12
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm0, %ymm15, %ymm0
+	
+	addq	$32, %r11
+	addq	$4, %r12
+	
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+
+	jg		0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemv_add_n_8_lib8, .-inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+k*sda*sizeof(float)
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x+k*sizeof(float)
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
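+//
+// reference model (sketch only; A is assumed packed in 8-row panels with
+// panel stride sda, i.e. r12 = 8*sda*sizeof(float) bytes between panels):
+//
+//   for(jj=0; jj<8; jj++)
+//       for(ii=0; ii<k; ii++)
+//           z[jj] += A[(ii/8)*8*sda + 8*jj + ii%8] * x[ii];
+//
+// each z[jj] is kept as 8 partial sums in ymm{jj} and reduced later by the
+// blend_t routine further down in this file.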
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemv_add_t_8_lib8, @function
+inner_kernel_gemv_add_t_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_t_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemv_add_t_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_t_8_lib8:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$8, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovups		0(%r13), %ymm12
+
+	vmovaps		0(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	
+	subl	$8, %r10d
+
+	vmovaps		32(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	
+	vmovaps		64(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+
+	vmovaps		96(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+	
+	vmovaps		128(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+	
+	vmovaps		160(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+	
+	vmovaps		192(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+	
+	vmovaps		224(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+	
+	addq	%r12, %r11
+	addq	$32, %r13
+	
+	cmpl	$7, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vcvtsi2ss	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%ymm14, %ymm13, %ymm14
+
+	vmaskmovps	0(%r13), %ymm14, %ymm12
+
+	vmaskmovps	0(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	
+	vmaskmovps	32(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	
+	vmaskmovps	64(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+
+	vmaskmovps	96(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+		
+	vmaskmovps	128(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+		
+	vmaskmovps	160(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+		
+	vmaskmovps	192(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+		
+	vmaskmovps	224(%r11), %ymm14, %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+		
+	sall	$2, %r10d
+	addq	%r10, %r11
+	addq	%r10, %r13
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemv_add_t_8_lib8, .-inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float) = 32*sda
+// r13   <- x
+// r14d  <- offA
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 
+// r11   <- 
+// r12   <- 
+// r13   <- 
+// r14d  <- offA
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
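+//
+// sketch of the offA handling (assuming 0 < offA < 8): the pointers are moved
+// back to the start of the current panel and a lane mask built from .LC00
+// keeps only the valid rows of the first panel:
+//
+//   mask[ii] = (ii >= offA) && (ii < offA+k);       // ii = 0..7
+//   z[jj]   += mask[ii] ? A[ii+8*jj]*x[ii] : 0.0f;  // summed over ii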
+
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_GEMV_ADD_T_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_gemv_add_t_8_lib8, @function
+inner_edge_gemv_add_t_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemv_add_t_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_gemv_add_t_8_lib8; .scl 2; .type 32; .endef
+inner_edge_gemv_add_t_8_lib8:
+#endif
+#endif
+
+	cmpl	$0, %r14d
+	jle		0f // return
+
+	movl	%r14d, %r15d
+	sall	$2, %r15d // offA*sizeof(float)
+
+	subq	%r15, %r11 // A - offA
+	subq	%r15, %r13 // x - offA
+
+	movl	%r10d, %r15d // kmax
+	addl	%r14d, %r15d // kmax + offA
+
+	vcvtsi2ss	%r14d, %xmm14, %xmm14 // offA
+	vcvtsi2ss	%r15d, %xmm15, %xmm15 // offA + kmax
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm13, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm13, %ymm15
+	vandps		%ymm15, %ymm14, %ymm14
+
+	vmaskmovps	0(%r13), %ymm14, %ymm12
+
+	vmovaps		0(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	
+	vmovaps		32(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	
+	vmovaps		64(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+
+	vmovaps		96(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+	vmovaps		128(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+
+	vmovaps		160(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+
+	vmovaps		192(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+
+	vmovaps		224(%r11), %ymm8
+	vmulps		%ymm8, %ymm12, %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+
+	addq	$32, %r13 // x + 4
+	addq	%r12, %r11 // A + bs*sda
+		
+	addl	%r14d, %r10d
+	subl	$8, %r10d // kmax - (8-offA)
+	
+0: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_gemv_add_t_8_lib8, .-inner_edge_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
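+//
+// reference model (sketch only): forward substitution on the 8x8 lower
+// triangular panel E, using the pre-inverted diagonal inv_diag_E:
+//
+//   for(jj=0; jj<8; jj++) {
+//       z[jj] *= inv_diag_E[jj];
+//       for(ii=jj+1; ii<8; ii++)
+//           z[ii] -= E[ii+8*jj] * z[jj];
+//   }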
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSV_LN_INV_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsv_ln_inv_8_lib8, @function
+inner_edge_trsv_ln_inv_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_ln_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsv_ln_inv_8_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_ln_inv_8_lib8:
+#endif
+#endif
+	
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vbroadcastss	0(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x01, %ymm1, %ymm0, %ymm0
+
+	vmovaps			0(%r10), %ymm13
+	vblendps		$0x01, %ymm14, %ymm13, %ymm13
+	vpermilps		$0x00, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	4(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x02, %ymm1, %ymm0, %ymm0
+
+	vmovaps			32(%r10), %ymm13
+	vblendps		$0x03, %ymm14, %ymm13, %ymm13
+	vpermilps		$0x55, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	8(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x04, %ymm1, %ymm0, %ymm0
+
+	vmovaps			64(%r10), %ymm13
+	vblendps		$0x07, %ymm14, %ymm13, %ymm13
+	vpermilps		$0xaa, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	12(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x08, %ymm1, %ymm0, %ymm0
+
+	vmovaps			96(%r10), %ymm13
+	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
+	vpermilps		$0xff, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	16(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x10, %ymm1, %ymm0, %ymm0
+
+	vmovaps			128(%r10), %ymm13
+	vblendps		$0x1f, %ymm14, %ymm13, %ymm13
+	vpermilps		$0x00, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	20(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x20, %ymm1, %ymm0, %ymm0
+
+	vmovaps			160(%r10), %ymm13
+	vblendps		$0x3f, %ymm14, %ymm13, %ymm13
+	vpermilps		$0x55, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	24(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x40, %ymm1, %ymm0, %ymm0
+
+	vmovaps			192(%r10), %ymm13
+	vblendps		$0x7f, %ymm14, %ymm13, %ymm13
+	vpermilps		$0xaa, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	28(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x80, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsv_ln_inv_8_lib8, .-inner_edge_trsv_ln_inv_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSV_LN_INV_8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsv_ln_inv_8_vs_lib8, @function
+inner_edge_trsv_ln_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_ln_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsv_ln_inv_8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_ln_inv_8_vs_lib8:
+#endif
+#endif
+	
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vbroadcastss	0(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x01, %ymm1, %ymm0, %ymm0
+	vmovaps			0(%r10), %ymm13
+	vblendps		$0x01, %ymm14, %ymm13, %ymm13
+	vpermilps		$0x00, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+
+	cmpl			$2, %r12d
+	jl				0f // ret
+
+	vbroadcastss	4(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x02, %ymm1, %ymm0, %ymm0
+	vmovaps			32(%r10), %ymm13
+	vblendps		$0x03, %ymm14, %ymm13, %ymm13
+	vpermilps		$0x55, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+
+	cmpl			$3, %r12d
+	jl				0f // ret
+
+	vbroadcastss	8(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x04, %ymm1, %ymm0, %ymm0
+	vmovaps			64(%r10), %ymm13
+	vblendps		$0x07, %ymm14, %ymm13, %ymm13
+	vpermilps		$0xaa, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+
+	cmpl			$4, %r12d
+	jl				0f // ret
+
+	vbroadcastss	12(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x08, %ymm1, %ymm0, %ymm0
+	vmovaps			96(%r10), %ymm13
+	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
+	vpermilps		$0xff, %ymm0, %ymm12
+	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+
+	cmpl			$5, %r12d
+	jl				0f // ret
+
+	vbroadcastss	16(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x10, %ymm1, %ymm0, %ymm0
+	vmovaps			128(%r10), %ymm13
+	vblendps		$0x1f, %ymm14, %ymm13, %ymm13
+	vpermilps		$0x00, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+
+	cmpl			$6, %r12d
+	jl				0f // ret
+
+	vbroadcastss	20(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x20, %ymm1, %ymm0, %ymm0
+	vmovaps			160(%r10), %ymm13
+	vblendps		$0x3f, %ymm14, %ymm13, %ymm13
+	vpermilps		$0x55, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+
+	cmpl			$7, %r12d
+	jl				0f // ret
+
+	vbroadcastss	24(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x40, %ymm1, %ymm0, %ymm0
+	vmovaps			192(%r10), %ymm13
+	vblendps		$0x7f, %ymm14, %ymm13, %ymm13
+	vpermilps		$0xaa, %ymm0, %ymm12
+	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
+	vmulps			%ymm13, %ymm12, %ymm15
+	vsubps			%ymm15, %ymm0, %ymm0
+
+	cmpl			$8, %r12d
+	jl				0f // ret
+
+	vbroadcastss	28(%r11), %ymm12
+	vmulps			%ymm0, %ymm12, %ymm1
+	vblendps		$0x80, %ymm1, %ymm0, %ymm0
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsv_ln_inv_8_vs_lib8, .-inner_edge_trsv_ln_inv_8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
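+//
+// reference model (sketch only): backward substitution with the transpose of
+// the 8x8 lower triangular panel E, i.e. it solves E^T * z_out = z_in, with
+// the part of E above the diagonal ignored:
+//
+//   for(jj=7; jj>=0; jj--) {
+//       z[jj] *= inv_diag_E[jj];
+//       for(ii=0; ii<jj; ii++)
+//           z[ii] -= E[jj+8*ii] * z[jj];
+//   }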
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsv_lt_inv_8_lib8, @function
+inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_lib8:
+#endif
+#endif
+	
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vmovaps			0(%r10), %ymm12
+	vblendps		$0x01, %ymm14, %ymm12, %ymm12
+	vmovaps			32(%r10), %ymm13
+	vblendps		$0x03, %ymm14, %ymm13, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm8
+	vunpckhps		%ymm13, %ymm12, %ymm9
+
+	vmovaps			64(%r10), %ymm12
+	vblendps		$0x07, %ymm14, %ymm12, %ymm12
+	vmovaps			96(%r10), %ymm13
+	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm10
+	vunpckhps		%ymm13, %ymm12, %ymm11
+
+	vshufps			$0x44, %ymm10, %ymm8, %ymm7
+	vshufps			$0xee, %ymm10, %ymm8, %ymm4
+	vshufps			$0x44, %ymm11, %ymm9, %ymm5
+	vshufps			$0xee, %ymm11, %ymm9, %ymm6
+	vextractf128	$0x1, %ymm7, %xmm7
+	vextractf128	$0x1, %ymm4, %xmm8
+	vextractf128	$0x1, %ymm5, %xmm9
+	vextractf128	$0x1, %ymm6, %xmm10
+
+	vmovaps			144(%r10), %xmm12
+	vblendps		$0x01, %xmm14, %xmm12, %xmm12
+	vmovaps			176(%r10), %xmm13
+	vblendps		$0x03, %xmm14, %xmm13, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm1
+	vunpckhps		%xmm13, %xmm12, %xmm2
+
+	vmovaps			208(%r10), %xmm12
+	vblendps		$0x07, %xmm14, %xmm12, %xmm12
+	vmovaps			240(%r10), %xmm13
+	vblendps		$0x0f, %xmm14, %xmm13, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm3
+	vunpckhps		%xmm13, %xmm12, %xmm15
+
+	vshufps			$0xee, %xmm3, %xmm1, %xmm11
+	vshufps			$0x44, %xmm15, %xmm2, %xmm12
+	vshufps			$0xee, %xmm15, %xmm2, %xmm13
+
+
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vextractf128	$0x1, %ymm0, %xmm1
+
+	vshufps			$0xff, %xmm1, %xmm1, %xmm2
+	vbroadcastss	28(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x08, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm10, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm13, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+	vshufps			$0xaa, %xmm1, %xmm1, %xmm2
+	vbroadcastss	24(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x04, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm9, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm12, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+	vshufps			$0x55, %xmm1, %xmm1, %xmm2
+	vbroadcastss	20(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x02, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm8, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm11, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+	vshufps			$0x00, %xmm1, %xmm1, %xmm2
+	vbroadcastss	16(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x01, %xmm2, %xmm1, %xmm1
+	vmulps			%xmm7, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+	vshufps			$0xff, %xmm0, %xmm0, %xmm2
+	vbroadcastss	12(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x08, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm6, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+	vshufps			$0xaa, %xmm0, %xmm0, %xmm2
+	vbroadcastss	8(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x04, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm5, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+	vshufps			$0x55, %xmm0, %xmm0, %xmm2
+	vbroadcastss	4(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x02, %xmm2, %xmm0, %xmm0
+	vmulps			%xmm4, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+	vshufps			$0x00, %xmm0, %xmm0, %xmm2
+	vbroadcastss	0(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x01, %xmm2, %xmm0, %xmm0
+
+	vinsertf128		$0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsv_lt_inv_8_lib8, .-inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- km
+// r13  <- kn
+// r14  <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- km
+// r13  <- kn
+// r14  <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsv_lt_inv_8_vs_lib8, @function
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#endif
+#endif
+	
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%ymm14, %ymm13, %ymm14
+
+	vmovups		0(%r14), %ymm15
+	vblendvps	%ymm14, %ymm0, %ymm15, %ymm0
+
+
+
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vmovaps			0(%r10), %ymm12
+	vblendps		$0x01, %ymm14, %ymm12, %ymm12
+	cmpl	$2, %r13d
+	jl		1f
+	vmovaps			32(%r10), %ymm13
+	vblendps		$0x03, %ymm14, %ymm13, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm8
+	vunpckhps		%ymm13, %ymm12, %ymm9
+
+	cmpl	$3, %r13d
+	jl		2f
+	vmovaps			64(%r10), %ymm12
+	vblendps		$0x07, %ymm14, %ymm12, %ymm12
+	cmpl	$4, %r13d
+	jl		3f
+	vmovaps			96(%r10), %ymm13
+	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm10
+	vunpckhps		%ymm13, %ymm12, %ymm11
+
+	vshufps			$0x44, %ymm10, %ymm8, %ymm7
+	vshufps			$0xee, %ymm10, %ymm8, %ymm4
+	vshufps			$0x44, %ymm11, %ymm9, %ymm5
+	vshufps			$0xee, %ymm11, %ymm9, %ymm6
+	vextractf128	$0x1, %ymm7, %xmm7
+	vextractf128	$0x1, %ymm4, %xmm8
+	vextractf128	$0x1, %ymm5, %xmm9
+	vextractf128	$0x1, %ymm6, %xmm10
+
+	cmpl	$5, %r13d
+	jl		4f
+	vmovaps			144(%r10), %xmm12
+	vblendps		$0x01, %xmm14, %xmm12, %xmm12
+	cmpl	$6, %r13d
+	jl		5f
+	vmovaps			176(%r10), %xmm13
+	vblendps		$0x03, %xmm14, %xmm13, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm1
+	vunpckhps		%xmm13, %xmm12, %xmm2
+
+	cmpl	$7, %r13d
+	jl		6f
+	vmovaps			208(%r10), %xmm12
+	vblendps		$0x07, %xmm14, %xmm12, %xmm12
+	cmpl	$8, %r13d
+	jl		7f
+	vmovaps			240(%r10), %xmm13
+	vblendps		$0x0f, %xmm14, %xmm13, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm3
+	vunpckhps		%xmm13, %xmm12, %xmm15
+
+	vshufps			$0xee, %xmm3, %xmm1, %xmm11
+	vshufps			$0x44, %xmm15, %xmm2, %xmm12
+	vshufps			$0xee, %xmm15, %xmm2, %xmm13
+
+	jmp		0f
+
+
+
+	vmovaps			%ymm14, %ymm12
+1:
+	vmovaps			%ymm14, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm8
+	vunpckhps		%ymm13, %ymm12, %ymm9
+
+2:
+	vmovaps			%ymm14, %ymm12
+3:
+	vmovaps			%ymm14, %ymm13
+	vunpcklps		%ymm13, %ymm12, %ymm10
+	vunpckhps		%ymm13, %ymm12, %ymm11
+
+	vshufps			$0x44, %ymm10, %ymm8, %ymm7
+	vshufps			$0xee, %ymm10, %ymm8, %ymm4
+	vshufps			$0x44, %ymm11, %ymm9, %ymm5
+	vshufps			$0xee, %ymm11, %ymm9, %ymm6
+	vextractf128	$0x1, %ymm7, %xmm7
+	vextractf128	$0x1, %ymm4, %xmm8
+	vextractf128	$0x1, %ymm5, %xmm9
+	vextractf128	$0x1, %ymm6, %xmm10
+
+	jmp		8f
+
+4:
+	vmovaps			%xmm14, %xmm12
+5:
+	vmovaps			%xmm14, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm1
+	vunpckhps		%xmm13, %xmm12, %xmm2
+
+6:
+	vmovaps			%xmm14, %xmm12
+7:
+	vmovaps			%xmm14, %xmm13
+	vunpcklps		%xmm13, %xmm12, %xmm3
+	vunpckhps		%xmm13, %xmm12, %xmm15
+
+	vshufps			$0xee, %xmm3, %xmm1, %xmm11
+	vshufps			$0x44, %xmm15, %xmm2, %xmm12
+	vshufps			$0xee, %xmm15, %xmm2, %xmm13
+
+8:
+	
+	vmovaps			%xmm14, %xmm11
+	vmovaps			%xmm14, %xmm12
+	vmovaps			%xmm14, %xmm13
+
+0:
+	vxorps			%ymm14, %ymm14, %ymm14
+
+	vextractf128	$0x1, %ymm0, %xmm1
+
+	cmpl	$8, %r12d
+	jl		0f
+
+	vshufps			$0xff, %xmm1, %xmm1, %xmm2
+	cmpl	$8, %r13d
+	jl		1f
+	vbroadcastss	28(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x08, %xmm2, %xmm1, %xmm1
+1:
+	vmulps			%xmm10, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm13, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+0:
+	cmpl	$7, %r12d
+	jl		0f
+
+	vshufps			$0xaa, %xmm1, %xmm1, %xmm2
+	cmpl	$7, %r13d
+	jl		1f
+	vbroadcastss	24(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x04, %xmm2, %xmm1, %xmm1
+1:
+	vmulps			%xmm9, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm12, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+0:
+	cmpl	$6, %r12d
+	jl		0f
+
+	vshufps			$0x55, %xmm1, %xmm1, %xmm2
+	cmpl	$6, %r13d
+	jl		1f
+	vbroadcastss	20(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x02, %xmm2, %xmm1, %xmm1
+1:
+	vmulps			%xmm8, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+	vmulps			%xmm11, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm1, %xmm1
+
+0:
+	cmpl	$5, %r12d
+	jl		0f
+
+	vshufps			$0x00, %xmm1, %xmm1, %xmm2
+	cmpl	$5, %r13d
+	jl		1f
+	vbroadcastss	16(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x01, %xmm2, %xmm1, %xmm1
+1:
+	vmulps			%xmm7, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+0:
+	cmpl	$4, %r12d
+	jl		0f
+
+	vshufps			$0xff, %xmm0, %xmm0, %xmm2
+	cmpl	$4, %r13d
+	jl		1f
+	vbroadcastss	12(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x08, %xmm2, %xmm0, %xmm0
+1:
+	vmulps			%xmm6, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+0:
+	cmpl	$3, %r12d
+	jl		0f
+
+	vshufps			$0xaa, %xmm0, %xmm0, %xmm2
+	cmpl	$3, %r13d
+	jl		1f
+	vbroadcastss	8(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x04, %xmm2, %xmm0, %xmm0
+1:
+	vmulps			%xmm5, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+0:
+	cmpl	$2, %r12d
+	jl		0f
+
+	vshufps			$0x55, %xmm0, %xmm0, %xmm2
+	cmpl	$2, %r13d
+	jl		1f
+	vbroadcastss	4(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x02, %xmm2, %xmm0, %xmm0
+1:
+	vmulps			%xmm4, %xmm2, %xmm15
+	vsubps			%xmm15, %xmm0, %xmm0
+
+0:
+	cmpl	$1, %r12d
+	jl		0f
+
+	vshufps			$0x00, %xmm0, %xmm0, %xmm2
+	cmpl	$1, %r13d
+	jl		1f
+	vbroadcastss	0(%r11), %xmm15
+	vmulps			%xmm2, %xmm15, %xmm2
+	vblendps		$0x01, %xmm2, %xmm0, %xmm0
+1:
+
+0:
+
+	vinsertf128		$0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsv_lt_inv_8_vs_lib8, .-inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
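+//
+// reference model (sketch only): the four partial accumulators are summed
+// element-wise and the result is combined with y as in the gemv update:
+//
+//   acc  = ymm0 + ymm1 + ymm2 + ymm3;       // 8 floats
+//   z[i] = alpha[0]*acc[i] + beta[0]*y[i];  // i = 0..7, left in ymm0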
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_scale_ab_8_lib8, @function
+inner_blend_n_scale_ab_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_scale_ab_8_lib8; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_8_lib8:
+#endif
+#endif
+
+	// reduction
+	vaddps			%ymm0, %ymm1, %ymm0
+	vaddps			%ymm2, %ymm3, %ymm2
+	vaddps			%ymm0, %ymm2, %ymm0
+
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+	vmulps			%ymm0, %ymm15, %ymm0
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+	vmovups			0(%r12), %ymm14
+	vmulps			%ymm15, %ymm14, %ymm14
+	vaddps			%ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_scale_ab_8_lib8, .-inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_SCALE_M11_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_scale_m11_8_lib8, @function
+inner_blend_n_scale_m11_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_m11_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_scale_m11_8_lib8; .scl 2; .type 32; .endef
+inner_blend_n_scale_m11_8_lib8:
+#endif
+#endif
+
+	// reduction
+	vaddps	%ymm0, %ymm1, %ymm0
+	vaddps	%ymm2, %ymm3, %ymm2
+	vaddps	%ymm0, %ymm2, %ymm0
+
+	// beta
+	vmovups		0(%r10), %ymm14
+	vsubps		%ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_scale_m11_8_lib8, .-inner_blend_n_scale_m11_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
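+//
+// reference model (sketch only): each ymm register holds 8 partial sums of
+// one output element; horizontal adds and a 128-bit lane swap reduce them
+// before scaling:
+//
+//   acc[j] = sum of the 8 lanes of ymm{j};   // j = 0..7
+//   z[j]   = alpha[0]*acc[j] + beta[0]*y[j]; // left in ymm0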
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_ab_8_lib8, @function
+inner_blend_t_scale_ab_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_ab_8_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_8_lib8:
+#endif
+#endif
+
+	// reduction
+	vhaddps			%ymm1, %ymm0, %ymm0
+	vhaddps			%ymm3, %ymm2, %ymm2
+	vhaddps			%ymm5, %ymm4, %ymm4
+	vhaddps			%ymm7, %ymm6, %ymm6
+
+	vhaddps			%ymm2, %ymm0, %ymm0
+	vhaddps			%ymm6, %ymm4, %ymm4
+
+	vperm2f128		$0x20, %ymm4, %ymm0, %ymm1
+	vperm2f128		$0x13, %ymm0, %ymm4, %ymm0
+
+	vaddps			%ymm0, %ymm1, %ymm0
+
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+	vmulps			%ymm0, %ymm15, %ymm0
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+	vmovups			0(%r12), %ymm14
+	vmulps			%ymm15, %ymm14, %ymm14
+	vaddps			%ymm0, %ymm14, %ymm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_ab_8_lib8, .-inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_m11_8_lib8, @function
+inner_blend_t_scale_m11_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_m11_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_m11_8_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_m11_8_lib8:
+#endif
+#endif
+
+	// reduction
+	vhaddps			%ymm1, %ymm0, %ymm0
+	vhaddps			%ymm3, %ymm2, %ymm2
+	vhaddps			%ymm5, %ymm4, %ymm4
+	vhaddps			%ymm7, %ymm6, %ymm6
+
+	vhaddps			%ymm2, %ymm0, %ymm0
+	vhaddps			%ymm6, %ymm4, %ymm4
+
+	vperm2f128		$0x20, %ymm4, %ymm0, %ymm1
+	vperm2f128		$0x13, %ymm0, %ymm4, %ymm0
+
+	vaddps			%ymm0, %ymm1, %ymm0
+
+	// beta
+	vmovups			0(%r10), %ymm14
+	vsubps			%ymm0, %ymm14, %ymm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_m11_8_lib8, .-inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store 
+//
+// input arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8_lib8, @function
+inner_store_8_lib8:
+#elif defined(OS_MAC)
+_inner_store_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8_lib8; .scl 2; .type 32; .endef
+inner_store_8_lib8:
+#endif
+#endif
+	
+	vmovups %ymm0,  0(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8_lib8, .-inner_store_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store vs
+//
+// input arguments:
+// r10   <- D
+// r11d   <- km
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d   <- km
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
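+//
+// sketch of the masked store: a lane mask is built by comparing the constants
+// .LC00 = { 0.5, ..., 7.5 } against km, so only the first km floats of ymm0
+// are written to memory:
+//
+//   mask[i] = (i < km);             // i = 0..7, sign bit drives vmaskmovps
+//   if(mask[i]) z[i] = ymm0[i];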
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8_vs_lib8, @function
+inner_store_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8_vs_lib8:
+#endif
+#endif
+	
+	vcvtsi2ss	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm14
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm14, %ymm15
+
+	vmaskmovps	%ymm0, %ymm15,  0(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8_vs_lib8, .-inner_store_8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store gen
+//
+// input arguments:
+// r10   <- D
+// r11d  <- k0 : start from (inc)
+// r12d  <- k1 : up to (exc)
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d  <- k0 : start from (inc)
+// r12d  <- k1 : up to (exc)
+// ymm0  <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
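+//
+// sketch of the generalized store: only rows i with k0 <= i < k1 are written,
+// the other memory locations are left untouched (mask again built from .LC00):
+//
+//   mask[i] = (i >= k0) && (i < k1);  // i = 0..7
+//   if(mask[i]) z[i] = ymm0[i];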
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8_gen_lib8, @function
+inner_store_8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm14, %xmm14
+	vcvtsi2ss	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm12, %ymm15
+	vandps		%ymm14, %ymm15, %ymm15
+
+	vmaskmovps	%ymm0, %ymm15,  0(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8_gen_lib8, .-inner_store_8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                            1      2              3          4          5             6          7
+// void kernel_sgemv_n_8_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z);
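+//
+// it computes z[0:8] = alpha*A[0:8,0:k]*x[0:k] + beta*y[0:8], with A packed in
+// 8-row panels; minimal C-side usage sketch (caller-owned buffers assumed):
+//
+//   float alpha = 1.0f, beta = 0.0f;
+//   // A: 8 x k panel-major block, x: k floats, y and z: 8 floats
+//   kernel_sgemv_n_8_lib8(k, &alpha, A, x, &beta, y, z);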
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_n_8_lib8
+	.type kernel_sgemv_n_8_lib8, @function
+kernel_sgemv_n_8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_n_8_lib8
+_kernel_sgemv_n_8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_n_8_lib8
+	.def kernel_sgemv_n_8_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_n_8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11   // beta
+	movq	ARG6, %r12   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_n_8_lib8, .-kernel_sgemv_n_8_lib8
+#endif
+
+
+
+
+
+//                               1      2              3          4          5             6          7          8
+// void kernel_sgemv_n_8_vs_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_n_8_vs_lib8
+	.type kernel_sgemv_n_8_vs_lib8, @function
+kernel_sgemv_n_8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_n_8_vs_lib8
+_kernel_sgemv_n_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_n_8_vs_lib8
+	.def kernel_sgemv_n_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_n_8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11   // beta
+	movq	ARG6, %r12   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	ARG8, %r11 // k1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_n_8_vs_lib8, .-kernel_sgemv_n_8_vs_lib8
+#endif
+
+
+
+
+
+//                                1      2              3          4          5             6          7          8       9
+// void kernel_sgemv_n_8_gen_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_n_8_gen_lib8
+	.type kernel_sgemv_n_8_gen_lib8, @function
+kernel_sgemv_n_8_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_n_8_gen_lib8
+_kernel_sgemv_n_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_n_8_gen_lib8
+	.def kernel_sgemv_n_8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_n_8_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11   // beta
+	movq	ARG6, %r12   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	ARG8, %r11 // k0
+	movq	ARG9, %r12 // k1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_n_8_gen_lib8, .-kernel_sgemv_n_8_gen_lib8
+#endif
+
+
+
+
+
+//                            1      2              3          4        5          6             7         8
+// void kernel_sgemv_t_8_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);
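+//
+// it computes z[0:8] = alpha*A[0:k,0:8]^T*x[0:k] + beta*y[0:8], with sda the
+// panel stride of A; minimal C-side usage sketch (caller-owned buffers assumed):
+//
+//   float alpha = 1.0f, beta = 1.0f;
+//   kernel_sgemv_t_8_lib8(k, &alpha, A, sda, x, &beta, y, z);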
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_t_8_lib8
+	.type kernel_sgemv_t_8_lib8, @function
+kernel_sgemv_t_8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_t_8_lib8
+_kernel_sgemv_t_8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_t_8_lib8
+	.def kernel_sgemv_t_8_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_t_8_lib8, .-kernel_sgemv_t_8_lib8
+#endif
+
+
+
+
+
+//                               1      2              3          4        5          6             7         8           9
+// void kernel_sgemv_t_8_vs_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_t_8_vs_lib8
+	.type kernel_sgemv_t_8_vs_lib8, @function
+kernel_sgemv_t_8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_t_8_vs_lib8
+_kernel_sgemv_t_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_t_8_vs_lib8
+	.def kernel_sgemv_t_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+	movq	ARG9, %r11 // k1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_t_8_vs_lib8, .-kernel_sgemv_t_8_vs_lib8
+#endif
+
+
+
+
+
+//                                1      2              3         4          5        6          7             8          9          10
+// void kernel_sgemv_t_8_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemv_t_8_gen_lib8
+	.type kernel_sgemv_t_8_gen_lib8, @function
+kernel_sgemv_t_8_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemv_t_8_gen_lib8
+_kernel_sgemv_t_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemv_t_8_gen_lib8
+	.def kernel_sgemv_t_8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_8_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // x
+	movq	ARG3, %r14 // offA
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemv_add_t_8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11   // beta
+	movq	ARG8, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG9, %r10 // z 
+	movq	ARG10, %r11 // km 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemv_t_8_gen_lib8, .-kernel_sgemv_t_8_gen_lib8
+#endif
+
+
+
+
+
+//                                 1      2          3                   4          5          6
+// void kernel_strsv_ln_inv_8_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z);
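+//
+// sketch of the operation (lower, not-transposed solve with pre-inverted
+// diagonal): y is first updated with the already-solved part of the system,
+// then the 8x8 lower triangular block at column k of the panel is solved:
+//
+//   t[0:8] = y[0:8] - A[0:8,0:k]*x[0:k];
+//   z[0:8] = inv(L)*t[0:8];   // L = 8x8 lower triangular block at A+8*k floats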
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsv_ln_inv_8_lib8
+	.type kernel_strsv_ln_inv_8_lib8, @function
+kernel_strsv_ln_inv_8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsv_ln_inv_8_lib8
+_kernel_strsv_ln_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsv_ln_inv_8_lib8
+	.def kernel_strsv_ln_inv_8_lib8; .scl 2; .type 32; .endef
+kernel_strsv_ln_inv_8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+	movq	%r11, %r13 // A+bs*k*sizeof(float)
+
+
+	// call inner blender n
+
+	movq	ARG5, %r10   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_m11_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_m11_8_lib8
+#endif
+#endif
+
+
+	// solution
+
+	movq	%r13, %r10 // A+bs*k*sizeof(float)
+	movq	ARG3, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSV_LN_INV_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsv_ln_inv_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsv_ln_inv_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsv_ln_inv_8_lib8, .-kernel_strsv_ln_inv_8_lib8
+#endif
+
+
+
+
+
+//                                    1      2          3                   4          5          6          7       8
+// void kernel_strsv_ln_inv_8_vs_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsv_ln_inv_8_vs_lib8
+	.type kernel_strsv_ln_inv_8_vs_lib8, @function
+kernel_strsv_ln_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsv_ln_inv_8_vs_lib8
+_kernel_strsv_ln_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsv_ln_inv_8_vs_lib8
+	.def kernel_strsv_ln_inv_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsv_ln_inv_8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG4, %r12  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+	movq	%r11, %r13 // A+bs*k*sizeof(float)
+
+
+	// call inner blender n
+
+	movq	ARG5, %r10   // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_m11_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_m11_8_lib8
+#endif
+#endif
+
+
+	// solution
+
+	movq	%r13, %r10 // A+bs*k*sizeof(float)
+	movq	ARG3, %r11 // inv_diag_A
+	movq	ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSV_LN_INV_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsv_ln_inv_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsv_ln_inv_8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // z 
+	movq	ARG7, %r11 // km
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsv_ln_inv_8_vs_lib8, .-kernel_strsv_ln_inv_8_vs_lib8
+#endif
+
+
+
+
+
+//                                 1      2          3        4                   5          6          7
+// void kernel_strsv_lt_inv_8_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
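+//
+// sketch of the operation (lower, transposed solve with pre-inverted diagonal):
+// the gemv_t part accumulates the contribution of rows 8..k-1, then the
+// transposed 8x8 diagonal block at A is solved:
+//
+//   t[0:8] = y[0:8] - A[8:k,0:8]^T * x[8:k];
+//   z[0:8] = inv(L^T)*t[0:8];   // L = 8x8 lower triangular block at A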
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsv_lt_inv_8_lib8
+	.type kernel_strsv_lt_inv_8_lib8, @function
+kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsv_lt_inv_8_lib8
+_kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsv_lt_inv_8_lib8
+	.def kernel_strsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	subl	$8, %r10d
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	addq	%r12, %r11 // A+8*sda*sizeof(float)
+	movq	ARG5, %r13 // x
+	addq	$32, %r13 // x+8 
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG6, %r10 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+	// solution
+
+	movq	ARG2, %r10 // A
+	movq	ARG4, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsv_lt_inv_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsv_lt_inv_8_lib8, .-kernel_strsv_lt_inv_8_lib8
+#endif
+
+
+
+
+
+//                                    1      2         3        4                  5         6         7         8      9
+// void kernel_strsv_lt_inv_8_vs_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
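+//
+// Same operation as kernel_strsv_lt_inv_8_lib8 above; the variable-size variant
+// additionally takes km and kn which, as suggested by the inner_*_vs_* subroutines
+// used below, bound the active rows/columns of the 8x8 corner block (the store is
+// masked and the solve restricted) so that blocks smaller than 8 at the matrix
+// border can be handled.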
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsv_lt_inv_8_vs_lib8
+	.type kernel_strsv_lt_inv_8_vs_lib8, @function
+kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsv_lt_inv_8_vs_lib8
+_kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsv_lt_inv_8_vs_lib8
+	.def kernel_strsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+	movq	ARG1, %r10 // k
+	subl	$8, %r10d
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	addq	%r12, %r11 // A+8*sda*sizeof(float)
+	movq	ARG5, %r13 // x
+	addq	$32, %r13 // x+8 
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+	// call inner blender t
+
+	movq	ARG6, %r10 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+	// solution
+
+	movq	ARG2, %r10 // A
+	movq	ARG4, %r11 // inv_diag_A
+	movq	ARG8, %r12 // km
+	movq	ARG9, %r13 // kn
+	movq	ARG5, %r14 // x
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsv_lt_inv_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // z 
+	movq	ARG9, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsv_lt_inv_8_vs_lib8, .-kernel_strsv_lt_inv_8_vs_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.float	0.5
+	.float	1.5
+	.float	2.5
+	.float	3.5
+	.float	4.5
+	.float	5.5
+	.float	6.5
+	.float	7.5
+
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgesc_lib8.S b/kernel/avx/kernel_sgesc_lib8.S
new file mode 100644
index 0000000..43ff708
--- /dev/null
+++ b/kernel/avx/kernel_sgesc_lib8.S
@@ -0,0 +1,506 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
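+//
+// Scales k columns of one 8-row panel in place by alpha[0]: the main loop handles
+// four 32-byte columns per iteration, then a one-column clean-up loop handles the
+// remaining k%4 columns.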
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGESC_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgesc_8_lib8, @function
+inner_kernel_sgesc_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgesc_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgesc_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgesc_8_lib8:
+#endif
+#endif
+	
+	vbroadcastss	0(%r11), %ymm15
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmulps		%ymm15, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r12)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmulps		%ymm15, %ymm0, %ymm0
+	vmovaps		%ymm0, 32(%r12)
+
+	vmovaps		64(%r12), %ymm0
+	vmulps		%ymm15, %ymm0, %ymm0
+	vmovaps		%ymm0, 64(%r12)
+	addq		$128, %r12
+
+	vmovaps		-32(%r12), %ymm0
+	vmulps		%ymm15, %ymm0, %ymm0
+	vmovaps		%ymm0, -32(%r12)
+
+	cmpl		$4, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmulps		%ymm15, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r12)
+	subl		$1, %r10d
+	addq		$32, %r12
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgesc_8_lib8, .-inner_kernel_sgesc_8_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- alpha
+// r12    <- A
+// r13d   <- m1
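+//
+// Same scaling as inner_kernel_sgesc_8_lib8, but each column store is masked so
+// that only the first m1 rows (m1 <= 8) are written back: the mask is built by
+// subtracting m1 from the constants { 0.5 ... 7.5 } in .LC00, which sets the sign
+// bit of lanes 0..m1-1, and is applied with vmaskmovps.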
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGESC_8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgesc_8_gen_lib8, @function
+inner_kernel_sgesc_8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgesc_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgesc_8_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgesc_8_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	vbroadcastss	0(%r11), %ymm14
+
+	cmpl	$3, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r12), %ymm0
+	vmulps		%ymm14, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15,  0(%r12)
+	subl		$4, %r10d
+
+	vmovaps		32(%r12), %ymm0
+	vmulps		%ymm14, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15,  32(%r12)
+
+	vmovaps		64(%r12), %ymm0
+	vmulps		%ymm14, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15,  64(%r12)
+	addq		$128, %r12
+
+	vmovaps		-32(%r12), %ymm0
+	vmulps		%ymm14, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15,  -32(%r12)
+
+	cmpl		$4, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean-up loop
+
+	vmovaps		0(%r12), %ymm0
+	vmulps		%ymm14, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm15,  0(%r12)
+	subl		$1, %r10d
+	addq		$32, %r12
+
+	cmpl		$0, %r10d
+	jg			3b // clean-up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgesc_8_gen_lib8, .-inner_kernel_sgesc_8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                          rdi    rsi           rdx
+// void kernel_sgesc_8_lib8(int k, float *alpha, float *A);
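+//
+// Reference sketch of the operation (an illustrative, hypothetical routine that
+// assumes the lib8 panel-major layout, where one panel stores 8 rows with 8
+// contiguous floats per column; it is not part of the BLASFEO API):
+//
+//     void kernel_sgesc_8_lib8_ref(int k, float *alpha, float *A)
+//         {
+//         int i, j;
+//         for(j=0; j<k; j++)
+//             for(i=0; i<8; i++)
+//                 A[i+8*j] *= alpha[0];
+//         }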
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgesc_8_lib8
+	.type kernel_sgesc_8_lib8, @function
+kernel_sgesc_8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgesc_8_lib8
+_kernel_sgesc_8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgesc_8_lib8
+	.def kernel_sgesc_8_lib8; .scl 2; .type 32; .endef
+kernel_sgesc_8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgesc kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGESC_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgesc_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgesc_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgesc_8_lib8, .-kernel_sgesc_8_lib8
+#endif
+
+
+
+
+
+//                              rdi    rsi           rdx       rcx
+// void kernel_sgesc_8_gen_lib8(int k, float *alpha, float *A, int m1);
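+//
+// Same scaling as kernel_sgesc_8_lib8, but only rows 0..m1-1 of each column are
+// written back, through the masked stores of the *_gen inner kernel.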
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgesc_8_gen_lib8
+	.type kernel_sgesc_8_gen_lib8, @function
+kernel_sgesc_8_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgesc_8_gen_lib8
+_kernel_sgesc_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgesc_8_gen_lib8
+	.def kernel_sgesc_8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgesc_8_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgesc kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // alpha
+	movq	ARG3, %r12  // A
+	movq	ARG4, %r13 // m1
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGESC_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgesc_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgesc_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgesc_8_gen_lib8, .-kernel_sgesc_8_gen_lib8
+#endif
+
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgetr_lib8.S b/kernel/avx/kernel_sgetr_lib8.S
new file mode 100644
index 0000000..745c42e
--- /dev/null
+++ b/kernel/avx/kernel_sgetr_lib8.S
@@ -0,0 +1,2476 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
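+//
+// Transposes 8x8 blocks from A into B, one full block (8 columns) per main-loop
+// iteration: the 8 loaded columns are interleaved with vunpcklps/vunpckhps,
+// paired with vshufps and recombined across 128-bit lanes with vperm2f128 before
+// being stored as 8 transposed columns of B.  The clean-up path transposes one
+// last block and stores only the first k%8 transposed columns.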
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGETR_8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgetr_8_lib8, @function
+inner_kernel_sgetr_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgetr_8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgetr_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgetr_8_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$7, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	subl		$8, %r10d
+	addq		%r12, %r11
+
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
+	vmovaps		%ymm2, 0(%r13)
+	vmovaps		%ymm3, 128(%r13)
+	vshufps		$0xee, %ymm10, %ymm8, %ymm0
+	vshufps		$0xee, %ymm14, %ymm12, %ymm1
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
+	vmovaps		%ymm2, 32(%r13)
+	vmovaps		%ymm3, 160(%r13)
+	vshufps		$0x44, %ymm11, %ymm9, %ymm0
+	vshufps		$0x44, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
+	vmovaps		%ymm2, 64(%r13)
+	vmovaps		%ymm3, 192(%r13)
+	vshufps		$0xee, %ymm11, %ymm9, %ymm0
+	vshufps		$0xee, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
+	vmovaps		%ymm2, 96(%r13)
+	vmovaps		%ymm3, 224(%r13)
+
+	addq		$256, %r13
+
+	cmpl		$7, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	// 0
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm8
+	vmovaps		%ymm8, 0(%r13)
+	cmpl	$1, %r10d
+	jle		3f
+	// 1
+	vperm2f128	$0x20, %ymm3, %ymm2, %ymm8
+	vmovaps		%ymm8, 32(%r13)
+	cmpl	$2, %r10d
+	jle		3f
+	// 2
+	vperm2f128	$0x20, %ymm5, %ymm4, %ymm8
+	vmovaps		%ymm8, 64(%r13)
+	cmpl	$3, %r10d
+	jle		3f
+	// 3
+	vperm2f128	$0x20, %ymm7, %ymm6, %ymm8
+	vmovaps		%ymm8, 96(%r13)
+	cmpl	$4, %r10d
+	jle		3f
+	// 4
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm8
+	vmovaps		%ymm8, 128(%r13)
+	cmpl	$5, %r10d
+	jle		3f
+	// 5
+	vperm2f128	$0x31, %ymm3, %ymm2, %ymm8
+	vmovaps		%ymm8, 160(%r13)
+	cmpl	$6, %r10d
+	jle		3f
+	// 6
+	vperm2f128	$0x31, %ymm5, %ymm4, %ymm8
+	vmovaps		%ymm8, 192(%r13)
+//	cmpl	$7, %r10d
+//	jle		3f
+	// 7
+//	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+//	vmovaps		%ymm8, 224(%r13)
+
+3:
+	movl	%r10d, %r14d
+	sall	$2, %r14d // kleft*sizeof(float)
+	addq	%r14, %r11 // A+kleft
+	movl	%r10d, %r14d
+	sall	$5, %r14d // kleft*bs*sizeof(float)
+	addq	%r14, %r13
+	movl	$0, %r10d
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgetr_8_lib8, .-inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
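+//
+// Same transpose as inner_kernel_sgetr_8_lib8, but every store is masked so that
+// only the first m1 rows of B are written: the mask is expected to have been
+// computed by one of the inner_edge_sgetr_8_*_gen_lib8 routines and spilled to
+// -32(%rsp), from where it is reloaded before each group of masked stores.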
+
+#if MACRO_LEVEL>=1
+	.macro INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_sgetr_8_gen_lib8, @function
+inner_kernel_sgetr_8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgetr_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_sgetr_8_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgetr_8_gen_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$7, %r10d
+	jle		0f // consider clean-up
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	subl		$8, %r10d
+	addq		%r12, %r11
+
+	vmovupd		-32(%rsp), %ymm4
+
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
+	vmaskmovps	%ymm2, %ymm4, 0(%r13)
+	vmaskmovps	%ymm3, %ymm4, 128(%r13)
+	vshufps		$0xee, %ymm10, %ymm8, %ymm0
+	vshufps		$0xee, %ymm14, %ymm12, %ymm1
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
+	vmaskmovps	%ymm2, %ymm4, 32(%r13)
+	vmaskmovps	%ymm3, %ymm4, 160(%r13)
+	vshufps		$0x44, %ymm11, %ymm9, %ymm0
+	vshufps		$0x44, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
+	vmaskmovps	%ymm2, %ymm4, 64(%r13)
+	vmaskmovps	%ymm3, %ymm4, 192(%r13)
+	vshufps		$0xee, %ymm11, %ymm9, %ymm0
+	vshufps		$0xee, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm2
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm3
+	vmaskmovps	%ymm2, %ymm4, 96(%r13)
+	vmaskmovps	%ymm3, %ymm4, 224(%r13)
+
+	addq		$256, %r13
+
+	cmpl		$7, %r10d
+	jg			1b // main loop 
+
+0: // consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	vmovupd		-32(%rsp), %ymm9
+
+	// 0
+	vperm2f128	$0x20, %ymm1, %ymm0, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 0(%r13)
+	cmpl	$1, %r10d
+	jle		3f
+	// 1
+	vperm2f128	$0x20, %ymm3, %ymm2, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 32(%r13)
+	cmpl	$2, %r10d
+	jle		3f
+	// 2
+	vperm2f128	$0x20, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 64(%r13)
+	cmpl	$3, %r10d
+	jle		3f
+	// 3
+	vperm2f128	$0x20, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 96(%r13)
+	cmpl	$4, %r10d
+	jle		3f
+	// 4
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 128(%r13)
+	cmpl	$5, %r10d
+	jle		3f
+	// 5
+	vperm2f128	$0x31, %ymm3, %ymm2, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 160(%r13)
+	cmpl	$6, %r10d
+	jle		3f
+	// 6
+	vperm2f128	$0x31, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 192(%r13)
+//	cmpl	$7, %r10d
+//	jle		3f
+	// 7
+//	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+//	vmaskmovps	%ymm8, %ymm9, 224(%r13)
+
+3:
+	movl	%r10d, %r14d
+	sall	$2, %r14d // kleft*sizeof(float)
+	addq	%r14, %r11 // A+kleft
+	movl	%r10d, %r14d
+	sall	$5, %r14d // kleft*bs*sizeof(float)
+	addq	%r14, %r13
+	movl	$0, %r10d
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgetr_8_gen_lib8, .-inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
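+//
+// For offsetA==0 there is no partial first panel to transpose: this edge routine
+// only builds the row mask from m1 (using the .LC00 constants) and spills it to
+// -32(%rsp) for the masked stores of the *_gen inner kernel.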
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_SGETR_8_0_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_sgetr_8_0_gen_lib8, @function
+inner_edge_sgetr_8_0_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_sgetr_8_0_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_0_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+	vmovupd		%ymm15, -32(%rsp) // spill mask to stack
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_sgetr_8_0_gen_lib8, .-inner_edge_sgetr_8_0_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_SGETR_8_1_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_sgetr_8_1_gen_lib8, @function
+inner_edge_sgetr_8_1_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_sgetr_8_1_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_1_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+	vmovupd		%ymm15, -32(%rsp) // spill mask to stack
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	vmovupd		-32(%rsp), %ymm9
+
+	// 0
+	// 1
+	vperm2f128	$0x20, %ymm3, %ymm2, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 0(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 2
+	vperm2f128	$0x20, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 32(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 3
+	vperm2f128	$0x20, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 64(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 4
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 96(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 5
+	vperm2f128	$0x31, %ymm3, %ymm2, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 128(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 6
+	vperm2f128	$0x31, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 160(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 7
+	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 192(%r13)
+	subl	$1, %r10d
+
+	addq	%r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$224, %r13 // B+7*bs*sizeof(float)
+
+	jmp		2f
+
+3:
+	movl	%r10d, %r14d
+	sall	$2, %r14d
+	addq	%r14, %r11 // A+k*sizeof(float)
+	movl	%r10d, %r14d
+	sall	$5, %r14d
+	addq	%r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_sgetr_8_1_gen_lib8, .-inner_edge_sgetr_8_1_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_SGETR_8_2_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_sgetr_8_2_gen_lib8, @function
+inner_edge_sgetr_8_2_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_sgetr_8_2_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_2_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+	vmovupd		%ymm15, -32(%rsp) // spill mask to stack
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	vmovupd		-32(%rsp), %ymm9
+
+	// 0
+	// 1
+	// 2
+	vperm2f128	$0x20, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 0(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 3
+	vperm2f128	$0x20, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 32(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 4
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 64(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 5
+	vperm2f128	$0x31, %ymm3, %ymm2, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 96(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 6
+	vperm2f128	$0x31, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 128(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 7
+	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 160(%r13)
+	subl	$1, %r10d
+
+	addq	%r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$192, %r13 // B+6*bs*sizeof(float)
+
+	jmp		2f
+
+3:
+	movl	%r10d, %r14d
+	sall	$2, %r14d
+	addq	%r14, %r11 // A+k*sizeof(float)
+	movl	%r10d, %r14d
+	sall	$5, %r14d
+	addq	%r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_sgetr_8_2_gen_lib8, .-inner_edge_sgetr_8_2_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_SGETR_8_3_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_sgetr_8_3_gen_lib8, @function
+inner_edge_sgetr_8_3_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_sgetr_8_3_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_3_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+	vmovupd		%ymm15, -32(%rsp) // spill mask to stack
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	vmovupd		-32(%rsp), %ymm9
+
+	// 0
+	// 1
+	// 2
+	// 3
+	vperm2f128	$0x20, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 0(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 4
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 32(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 5
+	vperm2f128	$0x31, %ymm3, %ymm2, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 64(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 6
+	vperm2f128	$0x31, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 96(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 7
+	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 128(%r13)
+	subl	$1, %r10d
+
+	addq	%r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$160, %r13 // B+5*bs*sizeof(float)
+
+	jmp		2f
+
+3:
+	movl	%r10d, %r14d
+	sall	$2, %r14d
+	addq	%r14, %r11 // A+k*sizeof(float)
+	movl	%r10d, %r14d
+	sall	$5, %r14d
+	addq	%r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_sgetr_8_3_gen_lib8, .-inner_edge_sgetr_8_3_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_SGETR_8_4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_sgetr_8_4_gen_lib8, @function
+inner_edge_sgetr_8_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_sgetr_8_4_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+	vmovupd		%ymm15, -32(%rsp) // spill mask to stack
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	vmovupd		-32(%rsp), %ymm9
+
+	// 0
+	// 1
+	// 2
+	// 3
+	// 4
+	vperm2f128	$0x31, %ymm1, %ymm0, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 0(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 5
+	vperm2f128	$0x31, %ymm3, %ymm2, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 32(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 6
+	vperm2f128	$0x31, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 64(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 7
+	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 96(%r13)
+	subl	$1, %r10d
+
+	addq	%r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$128, %r13 // B+4*bs*sizeof(float)
+
+	jmp		2f
+
+3:
+	movl	%r10d, %r14d
+	sall	$2, %r14d
+	addq	%r14, %r11 // A+k*sizeof(float)
+	movl	%r10d, %r14d
+	sall	$5, %r14d
+	addq	%r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_sgetr_8_4_gen_lib8, .-inner_edge_sgetr_8_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_SGETR_8_5_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_sgetr_8_5_gen_lib8, @function
+inner_edge_sgetr_8_5_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_sgetr_8_5_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_5_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+	vmovupd		%ymm15, -32(%rsp) // spill mask to stack
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	vmovupd		-32(%rsp), %ymm9
+
+	// 0
+	// 1
+	// 2
+	// 3
+	// 4
+	// 5
+	vperm2f128	$0x31, %ymm3, %ymm2, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 0(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 6
+	vperm2f128	$0x31, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 32(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 7
+	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 64(%r13)
+	subl	$1, %r10d
+
+	addq	%r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$96, %r13 // B+3*bs*sizeof(float)
+
+	jmp		2f
+
+3:
+	movl	%r10d, %r14d
+	sall	$2, %r14d
+	addq	%r14, %r11 // A+k*sizeof(float)
+	movl	%r10d, %r14d
+	sall	$5, %r14d
+	addq	%r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_sgetr_8_5_gen_lib8, .-inner_edge_sgetr_8_5_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_SGETR_8_6_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_sgetr_8_6_gen_lib8, @function
+inner_edge_sgetr_8_6_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_sgetr_8_6_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_6_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+	vmovupd		%ymm15, -32(%rsp) // spill mask to stack
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	vmovupd		-32(%rsp), %ymm9
+
+	// 0
+	// 1
+	// 2
+	// 3
+	// 4
+	// 5
+	// 6
+	vperm2f128	$0x31, %ymm5, %ymm4, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 0(%r13)
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jle		3f
+	// 7
+	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 32(%r13)
+	subl	$1, %r10d
+
+	addq	%r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$64, %r13 // B+2*bs*sizeof(float)
+
+	jmp		2f
+
+3:
+	movl	%r10d, %r14d
+	sall	$2, %r14d
+	addq	%r14, %r11 // A+k*sizeof(float)
+	movl	%r10d, %r14d
+	sall	$5, %r14d
+	addq	%r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_sgetr_8_6_gen_lib8, .-inner_edge_sgetr_8_6_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d   <- k
+// r11    <- A
+// r12    <- 8*sda*sizeof(float)
+// r13    <- B
+// r14d   <- m1
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_SGETR_8_7_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_sgetr_8_7_gen_lib8, @function
+inner_edge_sgetr_8_7_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_sgetr_8_7_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_7_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+	vmovupd		%ymm15, -32(%rsp) // spill mask to stack
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// common
+	vmovaps		0(%r11), %ymm0
+	vmovaps		32(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm8
+	vunpckhps	%ymm1, %ymm0, %ymm9
+	vmovaps		64(%r11), %ymm0
+	vmovaps		96(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm10
+	vunpckhps	%ymm1, %ymm0, %ymm11
+	vmovaps		128(%r11), %ymm0
+	vmovaps		160(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm12
+	vunpckhps	%ymm1, %ymm0, %ymm13
+	vmovaps		192(%r11), %ymm0
+	vmovaps		224(%r11), %ymm1
+	vunpcklps	%ymm1, %ymm0, %ymm14
+	vunpckhps	%ymm1, %ymm0, %ymm15
+	vshufps		$0x44, %ymm10, %ymm8, %ymm0
+	vshufps		$0x44, %ymm14, %ymm12, %ymm1
+	vshufps		$0xee, %ymm10, %ymm8, %ymm2
+	vshufps		$0xee, %ymm14, %ymm12, %ymm3
+	vshufps		$0x44, %ymm11, %ymm9, %ymm4
+	vshufps		$0x44, %ymm15, %ymm13, %ymm5
+	vshufps		$0xee, %ymm11, %ymm9, %ymm6
+	vshufps		$0xee, %ymm15, %ymm13, %ymm7
+
+	vmovupd		-32(%rsp), %ymm9
+
+	// 0
+	// 1
+	// 2
+	// 3
+	// 4
+	// 5
+	// 6
+	// 7
+	vperm2f128	$0x31, %ymm7, %ymm6, %ymm8
+	vmaskmovps	%ymm8, %ymm9, 0(%r13)
+	subl	$1, %r10d
+
+	addq	%r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$32, %r13 // B+1*bs*sizeof(float)
+
+//	jmp		2f
+//
+//3:
+//	movl	%r10d, %r14d
+//	sall	$2, %r14d
+//	addq	%r14, %r11 // A+k*sizeof(float)
+//	movl	%r10d, %r14d
+//	sall	$5, %r14d
+//	addq	%r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_sgetr_8_7_gen_lib8, .-inner_edge_sgetr_8_7_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                            rdi    rsi       rdx      rcx
+// void kernel_sgetr_8_0_lib8(int k, float *A, int sda, float *B);
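+//
+// Reference sketch of the operation (an illustrative, hypothetical routine that
+// assumes the lib8 panel-major layout: element (i,j) of the k x 8 source block
+// lives at A[(i/8)*8*sda + i%8 + 8*j], and B is the transposed 8 x k block stored
+// in a single panel; it is not part of the BLASFEO API):
+//
+//     void kernel_sgetr_8_0_lib8_ref(int k, float *A, int sda, float *B)
+//         {
+//         int i, j;
+//         for(i=0; i<k; i++)
+//             for(j=0; j<8; j++)
+//                 B[j+8*i] = A[(i/8)*8*sda + i%8 + 8*j];
+//         }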
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_0_lib8
+	.type kernel_sgetr_8_0_lib8, @function
+kernel_sgetr_8_0_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_0_lib8
+_kernel_sgetr_8_0_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_0_lib8
+	.def kernel_sgetr_8_0_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_0_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+
+	// offsetA==0: no edge
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_0_lib8, .-kernel_sgetr_8_0_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi       rdx      rcx       r8
+// void kernel_sgetr_8_0_gen_lib8(int k, float *A, int sda, float *B, int m1);
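+//
+// Same transpose as kernel_sgetr_8_0_lib8, but only the first m1 rows of B
+// (m1 <= 8) are written, through the row mask set up by
+// inner_edge_sgetr_8_0_gen_lib8.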
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_0_gen_lib8
+	.type kernel_sgetr_8_0_gen_lib8, @function
+kernel_sgetr_8_0_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_0_gen_lib8
+_kernel_sgetr_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_0_gen_lib8
+	.def kernel_sgetr_8_0_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_0_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr edge and kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14  // m1
+
+	// offsetA==0: edge to compute mask
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_0_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_0_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_0_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_0_gen_lib8, .-kernel_sgetr_8_0_gen_lib8
+#endif
+
+
+
+
+
+//                            rdi    rsi       rdx      rcx
+// void kernel_sgetr_8_1_lib8(int k, float *A, int sda, float *B);
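+//
+// Offset variant: the source block starts at row 1 within its 8-row panel, so
+// inner_edge_sgetr_8_1_gen_lib8 (called with m1=8, i.e. an all-ones mask) first
+// transposes the up to 7 source rows remaining in that first panel, and the
+// aligned inner kernel then handles the remaining panels.  The other
+// kernel_sgetr_8_<offset>_lib8 kernels below follow the same pattern.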
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_1_lib8
+	.type kernel_sgetr_8_1_lib8, @function
+kernel_sgetr_8_1_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_1_lib8
+_kernel_sgetr_8_1_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_1_lib8
+	.def kernel_sgetr_8_1_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_1_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr edge and kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	$8, %r14  // m1
+
+	// offsetA==1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_1_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_1_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_1_lib8, .-kernel_sgetr_8_1_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi       rdx      rcx       r8
+// void kernel_sgetr_8_1_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_1_gen_lib8
+	.type kernel_sgetr_8_1_gen_lib8, @function
+kernel_sgetr_8_1_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_1_gen_lib8
+_kernel_sgetr_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_1_gen_lib8
+	.def kernel_sgetr_8_1_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_1_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr edge and kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14  // m1
+
+	// offsetA==1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_1_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_1_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_1_gen_lib8, .-kernel_sgetr_8_1_gen_lib8
+#endif
+
+
+
+
+
+//                            rdi    rsi       rdx      rcx
+// void kernel_sgetr_8_2_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_2_lib8
+	.type kernel_sgetr_8_2_lib8, @function
+kernel_sgetr_8_2_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_2_lib8
+_kernel_sgetr_8_2_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_2_lib8
+	.def kernel_sgetr_8_2_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_2_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr edge and kernel
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	$8, %r14  // m1
+
+	// offsetA==2
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_2_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_2_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_2_lib8, .-kernel_sgetr_8_2_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi       rdx      rcx       r8
+// void kernel_sgetr_8_2_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_2_gen_lib8
+	.type kernel_sgetr_8_2_gen_lib8, @function
+kernel_sgetr_8_2_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_2_gen_lib8
+_kernel_sgetr_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_2_gen_lib8
+	.def kernel_sgetr_8_2_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_2_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernels
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14  // m1
+
+	// offsetA==2
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_2_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_2_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_2_gen_lib8, .-kernel_sgetr_8_2_gen_lib8
+#endif
+
+
+
+
+
+//                            rdi    rsi       rdx      rcx
+// void kernel_sgetr_8_3_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_3_lib8
+	.type kernel_sgetr_8_3_lib8, @function
+kernel_sgetr_8_3_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_3_lib8
+_kernel_sgetr_8_3_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_3_lib8
+	.def kernel_sgetr_8_3_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_3_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernels
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	$8, %r14  // m1
+
+	// offsetA==3
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_3_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_3_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_3_lib8, .-kernel_sgetr_8_3_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi       rdx      rcx       r8
+// void kernel_sgetr_8_3_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_3_gen_lib8
+	.type kernel_sgetr_8_3_gen_lib8, @function
+kernel_sgetr_8_3_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_3_gen_lib8
+_kernel_sgetr_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_3_gen_lib8
+	.def kernel_sgetr_8_3_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_3_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernels
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14  // m1
+
+	// offsetA==3
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_3_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_3_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_3_gen_lib8, .-kernel_sgetr_8_3_gen_lib8
+#endif
+
+
+
+
+
+//                            rdi    rsi       rdx      rcx
+// void kernel_sgetr_8_4_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_4_lib8
+	.type kernel_sgetr_8_4_lib8, @function
+kernel_sgetr_8_4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_4_lib8
+_kernel_sgetr_8_4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_4_lib8
+	.def kernel_sgetr_8_4_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernels
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	$8, %r14  // m1
+
+	// offsetA==4
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_4_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_4_lib8, .-kernel_sgetr_8_4_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi       rdx      rcx       r8
+// void kernel_sgetr_8_4_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_4_gen_lib8
+	.type kernel_sgetr_8_4_gen_lib8, @function
+kernel_sgetr_8_4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_4_gen_lib8
+_kernel_sgetr_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_4_gen_lib8
+	.def kernel_sgetr_8_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernels
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14  // m1
+
+	// offsetA==4
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_4_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_4_gen_lib8, .-kernel_sgetr_8_4_gen_lib8
+#endif
+
+
+
+
+
+//                            rdi    rsi       rdx      rcx
+// void kernel_sgetr_8_5_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_5_lib8
+	.type kernel_sgetr_8_5_lib8, @function
+kernel_sgetr_8_5_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_5_lib8
+_kernel_sgetr_8_5_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_5_lib8
+	.def kernel_sgetr_8_5_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_5_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernels
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	$8, %r14  // m1
+
+	// offsetA==5
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_5_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_5_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_5_lib8, .-kernel_sgetr_8_5_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi       rdx      rcx       r8
+// void kernel_sgetr_8_5_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_5_gen_lib8
+	.type kernel_sgetr_8_5_gen_lib8, @function
+kernel_sgetr_8_5_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_5_gen_lib8
+_kernel_sgetr_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_5_gen_lib8
+	.def kernel_sgetr_8_5_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_5_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernels
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14  // m1
+
+	// offsetA==5
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_5_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_5_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_5_gen_lib8, .-kernel_sgetr_8_5_gen_lib8
+#endif
+
+
+
+
+
+//                            rdi    rsi       rdx      rcx
+// void kernel_sgetr_8_6_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_6_lib8
+	.type kernel_sgetr_8_6_lib8, @function
+kernel_sgetr_8_6_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_6_lib8
+_kernel_sgetr_8_6_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_6_lib8
+	.def kernel_sgetr_8_6_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_6_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernels
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	$8, %r14  // m1
+
+	// offsetA==6
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_6_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_6_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_6_lib8, .-kernel_sgetr_8_6_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi       rdx      rcx       r8
+// void kernel_sgetr_8_6_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_6_gen_lib8
+	.type kernel_sgetr_8_6_gen_lib8, @function
+kernel_sgetr_8_6_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_6_gen_lib8
+_kernel_sgetr_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_6_gen_lib8
+	.def kernel_sgetr_8_6_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_6_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernels
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14  // m1
+
+	// offsetA==6
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_6_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_6_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_6_gen_lib8, .-kernel_sgetr_8_6_gen_lib8
+#endif
+
+
+
+
+
+//                            rdi    rsi       rdx      rcx
+// void kernel_sgetr_8_7_lib8(int k, float *A, int sda, float *B);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_7_lib8
+	.type kernel_sgetr_8_7_lib8, @function
+kernel_sgetr_8_7_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_7_lib8
+_kernel_sgetr_8_7_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_7_lib8
+	.def kernel_sgetr_8_7_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_7_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernels
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	$8, %r14  // m1
+
+	// offsetA==7
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_7_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_7_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_7_lib8, .-kernel_sgetr_8_7_lib8
+#endif
+
+
+
+
+
+//                                rdi    rsi       rdx      rcx       r8
+// void kernel_sgetr_8_7_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgetr_8_7_gen_lib8
+	.type kernel_sgetr_8_7_gen_lib8, @function
+kernel_sgetr_8_7_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgetr_8_7_gen_lib8
+_kernel_sgetr_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgetr_8_7_gen_lib8
+	.def kernel_sgetr_8_7_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_7_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// call inner sgetr kernels
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13  // B
+	movq	ARG5, %r14  // m1
+
+	// offsetA==7
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_SGETR_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_sgetr_8_7_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_sgetr_8_7_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgetr_8_7_gen_lib8, .-kernel_sgetr_8_7_gen_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
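+
+// The 32-byte constants below are stored as raw IEEE-754 single-precision bit
+// patterns: for instance 1056964608 == 0x3f000000 == 0.5f and
+// 3212836864 == 0xbf800000 == -1.0f, matching the float lists in the comments
+// (which enumerate the lanes from highest to lowest, while memory order is
+// lowest first). The half-integer ramps .LC00-.LC02 are presumably compared
+// against a broadcast row count in the *_gen routines to build per-lane
+// vmaskmovps masks; only the bit-pattern decoding is asserted here.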
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/Makefile b/kernel/avx2/Makefile
new file mode 100644
index 0000000..adb91c4
--- /dev/null
+++ b/kernel/avx2/Makefile
@@ -0,0 +1,48 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS = 
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_8x4_lib4.o kernel_dgemm_8x8_lib4.o kernel_dgemm_12x4_lib4.o kernel_dgemv_8_lib4.o kernel_dsymv_6_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgebp_lib4.o kernel_dgelqf_4_lib4.o
+OBJS += kernel_sgemm_24x4_lib8.o kernel_sgemm_16x4_lib8.o kernel_sgemm_8x8_lib8.o kernel_sgemm_8x4_lib8.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+	rm -f *.o
+	rm -f *.s
diff --git a/kernel/avx2/kernel_dgebp_lib4.S b/kernel/avx2/kernel_dgebp_lib4.S
new file mode 100644
index 0000000..4093b23
--- /dev/null
+++ b/kernel/avx2/kernel_dgebp_lib4.S
@@ -0,0 +1,2741 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
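+
+// The PROLOGUE/EPILOGUE macros and the ARGn defines above abstract the calling
+// convention: on System V (Linux/Mac) the first six integer arguments arrive
+// in rdi/rsi/rdx/rcx/r8/r9 and the rest on the stack, while on Windows x64 the
+// first four arrive in rcx/rdx/r8/r9 and xmm6-xmm15 plus rdi/rsi are
+// callee-saved (hence the extra saves and the larger STACKSIZE). Each kernel
+// body can then refer to ARG1..ARGn uniformly, independent of the OS.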
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+//                                1      2          3        4          5          6
+// void kernel_dger4_sub_12r_lib4(int k, double *A, int sda, double *B, double *C, int sdc)
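+//
+// Rank-4 update of a 12-row strip: C -= A * B, where A is 12x4 (stored as
+// three 4-row panels spaced by sda), B provides 4 rows and C is 12 x k (three
+// 4-row panels spaced by sdc). A rough C sketch of the math, using plain dense
+// column-major indices for illustration only (the actual storage is
+// panel-major):
+//
+//	for(jj=0; jj<k; jj++)
+//		for(ii=0; ii<12; ii++)
+//			for(ll=0; ll<4; ll++)
+//				C[ii+jj*12] -= A[ii+ll*12] * B[ll+jj*4];
+//
+// The main loop below keeps the whole A block in ymm0-ymm11 and updates four
+// columns of C per iteration; the one-column cleanup loop handles k%4.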
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger4_sub_12r_lib4
+	.type kernel_dger4_sub_12r_lib4, @function
+kernel_dger4_sub_12r_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger4_sub_12r_lib4
+_kernel_dger4_sub_12r_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger4_sub_12r_lib4
+	.def kernel_dger4_sub_12r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_12r_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+	sall	$5, %r15d // 4*sdc*sizeof(double)
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// load block from A
+	vmovapd	0(%r11), %ymm0
+	vmovapd	32(%r11), %ymm1
+	vmovapd	64(%r11), %ymm2
+	vmovapd	96(%r11), %ymm3
+
+	vmovapd	0(%r11, %r12, 1), %ymm4
+	vmovapd	32(%r11, %r12, 1), %ymm5
+	vmovapd	64(%r11, %r12, 1), %ymm6
+	vmovapd	96(%r11, %r12, 1), %ymm7
+
+	vmovapd	0(%r11, %r12, 2), %ymm8
+	vmovapd	32(%r11, %r12, 2), %ymm9
+	vmovapd	64(%r11, %r12, 2), %ymm10
+	vmovapd	96(%r11, %r12, 2), %ymm11
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r14), %ymm12
+	vmovapd			0(%r14, %r15, 1), %ymm13
+	vmovapd			0(%r14, %r15, 2), %ymm14
+	vbroadcastsd	0(%r13), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm12
+	vfnmadd231pd	%ymm4, %ymm15, %ymm13
+	vfnmadd231pd	%ymm8, %ymm15, %ymm14
+	vbroadcastsd	8(%r13), %ymm15
+	subl	$4, %r10d
+	vfnmadd231pd	%ymm1, %ymm15, %ymm12
+	vfnmadd231pd	%ymm5, %ymm15, %ymm13
+	vfnmadd231pd	%ymm9, %ymm15, %ymm14
+	vbroadcastsd	16(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm12
+	vfnmadd231pd	%ymm6, %ymm15, %ymm13
+	vfnmadd231pd	%ymm10, %ymm15, %ymm14
+	vbroadcastsd	24(%r13), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm12
+	vfnmadd231pd	%ymm7, %ymm15, %ymm13
+	vfnmadd231pd	%ymm11, %ymm15, %ymm14
+	vmovapd			%ymm12, 0(%r14)
+	vmovapd			%ymm13, 0(%r14, %r15, 1)
+	vmovapd			%ymm14, 0(%r14, %r15, 2)
+
+	vmovapd			32(%r14), %ymm12
+	vmovapd			32(%r14, %r15, 1), %ymm13
+	vmovapd			32(%r14, %r15, 2), %ymm14
+	vbroadcastsd	32(%r13), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm12
+	vfnmadd231pd	%ymm4, %ymm15, %ymm13
+	vfnmadd231pd	%ymm8, %ymm15, %ymm14
+	vbroadcastsd	40(%r13), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm12
+	vfnmadd231pd	%ymm5, %ymm15, %ymm13
+	vfnmadd231pd	%ymm9, %ymm15, %ymm14
+	vbroadcastsd	48(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm12
+	vfnmadd231pd	%ymm6, %ymm15, %ymm13
+	vfnmadd231pd	%ymm10, %ymm15, %ymm14
+	vbroadcastsd	56(%r13), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm12
+	vfnmadd231pd	%ymm7, %ymm15, %ymm13
+	vfnmadd231pd	%ymm11, %ymm15, %ymm14
+	vmovapd			%ymm12, 32(%r14)
+	vmovapd			%ymm13, 32(%r14, %r15, 1)
+	vmovapd			%ymm14, 32(%r14, %r15, 2)
+
+	vmovapd			64(%r14), %ymm12
+	vmovapd			64(%r14, %r15, 1), %ymm13
+	vmovapd			64(%r14, %r15, 2), %ymm14
+	vbroadcastsd	64(%r13), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm12
+	vfnmadd231pd	%ymm4, %ymm15, %ymm13
+	vfnmadd231pd	%ymm8, %ymm15, %ymm14
+	vbroadcastsd	72(%r13), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm12
+	vfnmadd231pd	%ymm5, %ymm15, %ymm13
+	vfnmadd231pd	%ymm9, %ymm15, %ymm14
+	vbroadcastsd	80(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm12
+	vfnmadd231pd	%ymm6, %ymm15, %ymm13
+	vfnmadd231pd	%ymm10, %ymm15, %ymm14
+	vbroadcastsd	88(%r13), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm12
+	vfnmadd231pd	%ymm7, %ymm15, %ymm13
+	vfnmadd231pd	%ymm11, %ymm15, %ymm14
+	vmovapd			%ymm12, 64(%r14)
+	vmovapd			%ymm13, 64(%r14, %r15, 1)
+	vmovapd			%ymm14, 64(%r14, %r15, 2)
+
+	vmovapd			96(%r14), %ymm12
+	vmovapd			96(%r14, %r15, 1), %ymm13
+	vmovapd			96(%r14, %r15, 2), %ymm14
+	vbroadcastsd	96(%r13), %ymm15
+	addq	$128, %r13
+	vfnmadd231pd	%ymm0, %ymm15, %ymm12
+	vfnmadd231pd	%ymm4, %ymm15, %ymm13
+	vfnmadd231pd	%ymm8, %ymm15, %ymm14
+	vbroadcastsd	-24(%r13), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm12
+	vfnmadd231pd	%ymm5, %ymm15, %ymm13
+	vfnmadd231pd	%ymm9, %ymm15, %ymm14
+	vbroadcastsd	-16(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm12
+	vfnmadd231pd	%ymm6, %ymm15, %ymm13
+	vfnmadd231pd	%ymm10, %ymm15, %ymm14
+	vbroadcastsd	-8(%r13), %ymm15
+	addq	$128, %r14
+	vfnmadd231pd	%ymm3, %ymm15, %ymm12
+	vfnmadd231pd	%ymm7, %ymm15, %ymm13
+	vfnmadd231pd	%ymm11, %ymm15, %ymm14
+	vmovapd			%ymm12, -32(%r14)
+	vmovapd			%ymm13, -32(%r14, %r15, 1)
+	vmovapd			%ymm14, -32(%r14, %r15, 2)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r14), %ymm12
+	vmovapd			0(%r14, %r15, 1), %ymm13
+	vmovapd			0(%r14, %r15, 2), %ymm14
+	vbroadcastsd	0(%r13), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm12
+	vfnmadd231pd	%ymm4, %ymm15, %ymm13
+	vfnmadd231pd	%ymm8, %ymm15, %ymm14
+	vbroadcastsd	8(%r13), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm12
+	vfnmadd231pd	%ymm5, %ymm15, %ymm13
+	vfnmadd231pd	%ymm9, %ymm15, %ymm14
+	vbroadcastsd	16(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm12
+	vfnmadd231pd	%ymm6, %ymm15, %ymm13
+	vfnmadd231pd	%ymm10, %ymm15, %ymm14
+	vbroadcastsd	24(%r13), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm12
+	vfnmadd231pd	%ymm7, %ymm15, %ymm13
+	vfnmadd231pd	%ymm11, %ymm15, %ymm14
+	vmovapd			%ymm12, 0(%r14)
+	vmovapd			%ymm13, 0(%r14, %r15, 1)
+	vmovapd			%ymm14, 0(%r14, %r15, 2)
+
+	addq	$32, %r13
+	addq	$32, %r14
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger4_sub_12r_lib4, .-kernel_dger4_sub_12r_lib4
+#endif
+
+
+
+
+
+//                                  1      2          3        4          5          6        7
+// void kernel_dger4_sub_12r_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, int km)
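+//
+// Same operation as kernel_dger4_sub_12r_lib4, but for a strip with only km
+// valid rows (km < 12): km is turned into a vmaskmovpd load mask (via the
+// .LC02 constant defined at the end of this file), so the invalid rows of the
+// third (bottom) A panel are never read and, since those lanes are zeroed, the
+// corresponding rows of C are left numerically unchanged.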
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger4_sub_12r_vs_lib4
+	.type kernel_dger4_sub_12r_vs_lib4, @function
+kernel_dger4_sub_12r_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger4_sub_12r_vs_lib4
+_kernel_dger4_sub_12r_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger4_sub_12r_vs_lib4
+	.def kernel_dger4_sub_12r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_12r_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+	sall	$5, %r15d // 4*sdc*sizeof(double)
+	movq	ARG7, %rax // km
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	vcvtsi2sd	%eax, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
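+	// ymm15 = .LC02 - (double)km broadcast to all lanes; lanes where the .LC02
+	// entry is below km end up negative (sign bit set), and vmaskmovpd below
+	// loads only those lanes of the third A panel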
+
+	// load block from A
+	vmovapd	0(%r11), %ymm0
+	vmovapd	32(%r11), %ymm1
+	vmovapd	64(%r11), %ymm2
+	vmovapd	96(%r11), %ymm3
+
+	vmovapd	0(%r11, %r12, 1), %ymm4
+	vmovapd	32(%r11, %r12, 1), %ymm5
+	vmovapd	64(%r11, %r12, 1), %ymm6
+	vmovapd	96(%r11, %r12, 1), %ymm7
+
+	vmaskmovpd	0(%r11, %r12, 2), %ymm15, %ymm8
+	vmaskmovpd	32(%r11, %r12, 2), %ymm15, %ymm9
+	vmaskmovpd	64(%r11, %r12, 2), %ymm15, %ymm10
+	vmaskmovpd	96(%r11, %r12, 2), %ymm15, %ymm11
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r14), %ymm12
+	vmovapd			0(%r14, %r15, 1), %ymm13
+	vmovapd			0(%r14, %r15, 2), %ymm14
+	vbroadcastsd	0(%r13), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm12
+	vfnmadd231pd	%ymm4, %ymm15, %ymm13
+	vfnmadd231pd	%ymm8, %ymm15, %ymm14
+	vbroadcastsd	8(%r13), %ymm15
+	subl	$4, %r10d
+	vfnmadd231pd	%ymm1, %ymm15, %ymm12
+	vfnmadd231pd	%ymm5, %ymm15, %ymm13
+	vfnmadd231pd	%ymm9, %ymm15, %ymm14
+	vbroadcastsd	16(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm12
+	vfnmadd231pd	%ymm6, %ymm15, %ymm13
+	vfnmadd231pd	%ymm10, %ymm15, %ymm14
+	vbroadcastsd	24(%r13), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm12
+	vfnmadd231pd	%ymm7, %ymm15, %ymm13
+	vfnmadd231pd	%ymm11, %ymm15, %ymm14
+	vmovapd			%ymm12, 0(%r14)
+	vmovapd			%ymm13, 0(%r14, %r15, 1)
+	vmovapd			%ymm14, 0(%r14, %r15, 2)
+
+	vmovapd			32(%r14), %ymm12
+	vmovapd			32(%r14, %r15, 1), %ymm13
+	vmovapd			32(%r14, %r15, 2), %ymm14
+	vbroadcastsd	32(%r13), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm12
+	vfnmadd231pd	%ymm4, %ymm15, %ymm13
+	vfnmadd231pd	%ymm8, %ymm15, %ymm14
+	vbroadcastsd	40(%r13), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm12
+	vfnmadd231pd	%ymm5, %ymm15, %ymm13
+	vfnmadd231pd	%ymm9, %ymm15, %ymm14
+	vbroadcastsd	48(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm12
+	vfnmadd231pd	%ymm6, %ymm15, %ymm13
+	vfnmadd231pd	%ymm10, %ymm15, %ymm14
+	vbroadcastsd	56(%r13), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm12
+	vfnmadd231pd	%ymm7, %ymm15, %ymm13
+	vfnmadd231pd	%ymm11, %ymm15, %ymm14
+	vmovapd			%ymm12, 32(%r14)
+	vmovapd			%ymm13, 32(%r14, %r15, 1)
+	vmovapd			%ymm14, 32(%r14, %r15, 2)
+
+	vmovapd			64(%r14), %ymm12
+	vmovapd			64(%r14, %r15, 1), %ymm13
+	vmovapd			64(%r14, %r15, 2), %ymm14
+	vbroadcastsd	64(%r13), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm12
+	vfnmadd231pd	%ymm4, %ymm15, %ymm13
+	vfnmadd231pd	%ymm8, %ymm15, %ymm14
+	vbroadcastsd	72(%r13), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm12
+	vfnmadd231pd	%ymm5, %ymm15, %ymm13
+	vfnmadd231pd	%ymm9, %ymm15, %ymm14
+	vbroadcastsd	80(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm12
+	vfnmadd231pd	%ymm6, %ymm15, %ymm13
+	vfnmadd231pd	%ymm10, %ymm15, %ymm14
+	vbroadcastsd	88(%r13), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm12
+	vfnmadd231pd	%ymm7, %ymm15, %ymm13
+	vfnmadd231pd	%ymm11, %ymm15, %ymm14
+	vmovapd			%ymm12, 64(%r14)
+	vmovapd			%ymm13, 64(%r14, %r15, 1)
+	vmovapd			%ymm14, 64(%r14, %r15, 2)
+
+	vmovapd			96(%r14), %ymm12
+	vmovapd			96(%r14, %r15, 1), %ymm13
+	vmovapd			96(%r14, %r15, 2), %ymm14
+	vbroadcastsd	96(%r13), %ymm15
+	addq	$128, %r13
+	vfnmadd231pd	%ymm0, %ymm15, %ymm12
+	vfnmadd231pd	%ymm4, %ymm15, %ymm13
+	vfnmadd231pd	%ymm8, %ymm15, %ymm14
+	vbroadcastsd	-24(%r13), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm12
+	vfnmadd231pd	%ymm5, %ymm15, %ymm13
+	vfnmadd231pd	%ymm9, %ymm15, %ymm14
+	vbroadcastsd	-16(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm12
+	vfnmadd231pd	%ymm6, %ymm15, %ymm13
+	vfnmadd231pd	%ymm10, %ymm15, %ymm14
+	vbroadcastsd	-8(%r13), %ymm15
+	addq	$128, %r14
+	vfnmadd231pd	%ymm3, %ymm15, %ymm12
+	vfnmadd231pd	%ymm7, %ymm15, %ymm13
+	vfnmadd231pd	%ymm11, %ymm15, %ymm14
+	vmovapd			%ymm12, -32(%r14)
+	vmovapd			%ymm13, -32(%r14, %r15, 1)
+	vmovapd			%ymm14, -32(%r14, %r15, 2)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r14), %ymm12
+	vmovapd			0(%r14, %r15, 1), %ymm13
+	vmovapd			0(%r14, %r15, 2), %ymm14
+	vbroadcastsd	0(%r13), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm12
+	vfnmadd231pd	%ymm4, %ymm15, %ymm13
+	vfnmadd231pd	%ymm8, %ymm15, %ymm14
+	vbroadcastsd	8(%r13), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm12
+	vfnmadd231pd	%ymm5, %ymm15, %ymm13
+	vfnmadd231pd	%ymm9, %ymm15, %ymm14
+	vbroadcastsd	16(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm12
+	vfnmadd231pd	%ymm6, %ymm15, %ymm13
+	vfnmadd231pd	%ymm10, %ymm15, %ymm14
+	vbroadcastsd	24(%r13), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm12
+	vfnmadd231pd	%ymm7, %ymm15, %ymm13
+	vfnmadd231pd	%ymm11, %ymm15, %ymm14
+	vmovapd			%ymm12, 0(%r14)
+	vmovapd			%ymm13, 0(%r14, %r15, 1)
+	vmovapd			%ymm14, 0(%r14, %r15, 2)
+
+	addq	$32, %r13
+	addq	$32, %r14
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger4_sub_12r_vs_lib4, .-kernel_dger4_sub_12r_vs_lib4
+#endif
+
+
+
+
+
+//                               1      2          3        4          5          6
+// void kernel_dger4_sub_8r_lib4(int k, double *A, int sda, double *B, double *C, int sdc)
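+//
+// Same rank-4 update as kernel_dger4_sub_12r_lib4, restricted to an 8-row
+// strip: A is 8x4 in two 4-row panels, kept in ymm0-ymm7 while the main loop
+// updates four columns of C per iteration.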
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger4_sub_8r_lib4
+	.type kernel_dger4_sub_8r_lib4, @function
+kernel_dger4_sub_8r_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger4_sub_8r_lib4
+_kernel_dger4_sub_8r_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger4_sub_8r_lib4
+	.def kernel_dger4_sub_8r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_8r_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+	sall	$5, %r15d // 4*sdc*sizeof(double)
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// load block from A
+	vmovapd	0(%r11), %ymm0
+	vmovapd	32(%r11), %ymm1
+	vmovapd	64(%r11), %ymm2
+	vmovapd	96(%r11), %ymm3
+
+	vmovapd	0(%r11, %r12, 1), %ymm4
+	vmovapd	32(%r11, %r12, 1), %ymm5
+	vmovapd	64(%r11, %r12, 1), %ymm6
+	vmovapd	96(%r11, %r12, 1), %ymm7
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r14), %ymm8
+	vmovapd			0(%r14, %r15, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm8
+	vfnmadd231pd	%ymm4, %ymm15, %ymm9
+	vbroadcastsd	8(%r13), %ymm15
+	subl	$4, %r10d
+	vfnmadd231pd	%ymm1, %ymm15, %ymm8
+	vfnmadd231pd	%ymm5, %ymm15, %ymm9
+	vbroadcastsd	16(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm8
+	vfnmadd231pd	%ymm6, %ymm15, %ymm9
+	vbroadcastsd	24(%r13), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm8
+	vfnmadd231pd	%ymm7, %ymm15, %ymm9
+	vmovapd			%ymm8, 0(%r14)
+	vmovapd			%ymm9, 0(%r14, %r15, 1)
+
+	vmovapd			32(%r14), %ymm8
+	vmovapd			32(%r14, %r15, 1), %ymm9
+	vbroadcastsd	32(%r13), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm8
+	vfnmadd231pd	%ymm4, %ymm15, %ymm9
+	vbroadcastsd	40(%r13), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm8
+	vfnmadd231pd	%ymm5, %ymm15, %ymm9
+	vbroadcastsd	48(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm8
+	vfnmadd231pd	%ymm6, %ymm15, %ymm9
+	vbroadcastsd	56(%r13), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm8
+	vfnmadd231pd	%ymm7, %ymm15, %ymm9
+	vmovapd			%ymm8, 32(%r14)
+	vmovapd			%ymm9, 32(%r14, %r15, 1)
+
+	vmovapd			64(%r14), %ymm8
+	vmovapd			64(%r14, %r15, 1), %ymm9
+	vbroadcastsd	64(%r13), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm8
+	vfnmadd231pd	%ymm4, %ymm15, %ymm9
+	vbroadcastsd	72(%r13), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm8
+	vfnmadd231pd	%ymm5, %ymm15, %ymm9
+	vbroadcastsd	80(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm8
+	vfnmadd231pd	%ymm6, %ymm15, %ymm9
+	vbroadcastsd	88(%r13), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm8
+	vfnmadd231pd	%ymm7, %ymm15, %ymm9
+	vmovapd			%ymm8, 64(%r14)
+	vmovapd			%ymm9, 64(%r14, %r15, 1)
+
+	vmovapd			96(%r14), %ymm8
+	vmovapd			96(%r14, %r15, 1), %ymm9
+	vbroadcastsd	96(%r13), %ymm15
+	addq	$128, %r13
+	vfnmadd231pd	%ymm0, %ymm15, %ymm8
+	vfnmadd231pd	%ymm4, %ymm15, %ymm9
+	vbroadcastsd	-24(%r13), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm8
+	vfnmadd231pd	%ymm5, %ymm15, %ymm9
+	vbroadcastsd	-16(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm8
+	vfnmadd231pd	%ymm6, %ymm15, %ymm9
+	vbroadcastsd	-8(%r13), %ymm15
+	addq	$128, %r14
+	vfnmadd231pd	%ymm3, %ymm15, %ymm8
+	vfnmadd231pd	%ymm7, %ymm15, %ymm9
+	vmovapd			%ymm8, -32(%r14)
+	vmovapd			%ymm9, -32(%r14, %r15, 1)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r14), %ymm8
+	vmovapd			0(%r14, %r15, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm8
+	vfnmadd231pd	%ymm4, %ymm15, %ymm9
+	vbroadcastsd	8(%r13), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm8
+	vfnmadd231pd	%ymm5, %ymm15, %ymm9
+	vbroadcastsd	16(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm8
+	vfnmadd231pd	%ymm6, %ymm15, %ymm9
+	vbroadcastsd	24(%r13), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm8
+	vfnmadd231pd	%ymm7, %ymm15, %ymm9
+	vmovapd			%ymm8, 0(%r14)
+	vmovapd			%ymm9, 0(%r14, %r15, 1)
+
+	addq	$32, %r13
+	addq	$32, %r14
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger4_sub_8r_lib4, .-kernel_dger4_sub_8r_lib4
+#endif
+
+
+
+
+
+//                                 1      2          3        4          5          6        7
+// void kernel_dger4_sub_8r_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, int km)
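+//
+// 8-row variant with variable row count: km masks the loads of the second
+// (bottom) A panel via vmaskmovpd and the .LC01 constant, mirroring the
+// masking used in kernel_dger4_sub_12r_vs_lib4.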
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger4_sub_8r_vs_lib4
+	.type kernel_dger4_sub_8r_vs_lib4, @function
+kernel_dger4_sub_8r_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger4_sub_8r_vs_lib4
+_kernel_dger4_sub_8r_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger4_sub_8r_vs_lib4
+	.def kernel_dger4_sub_8r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_8r_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+	sall	$5, %r15d // 4*sdc*sizeof(double)
+	movq	ARG7, %rax // km
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	vcvtsi2sd	%eax, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC01(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC01(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
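+	// same sign-bit mask trick as in kernel_dger4_sub_12r_vs_lib4, here built
+	// from .LC01 and applied to the second A panel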
+
+	// load block from A
+	vmovapd	0(%r11), %ymm0
+	vmovapd	32(%r11), %ymm1
+	vmovapd	64(%r11), %ymm2
+	vmovapd	96(%r11), %ymm3
+
+	vmaskmovpd	0(%r11, %r12, 1), %ymm15, %ymm4
+	vmaskmovpd	32(%r11, %r12, 1), %ymm15, %ymm5
+	vmaskmovpd	64(%r11, %r12, 1), %ymm15, %ymm6
+	vmaskmovpd	96(%r11, %r12, 1), %ymm15, %ymm7
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r14), %ymm8
+	vmovapd			0(%r14, %r15, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm8
+	vfnmadd231pd	%ymm4, %ymm15, %ymm9
+	vbroadcastsd	8(%r13), %ymm15
+	subl	$4, %r10d
+	vfnmadd231pd	%ymm1, %ymm15, %ymm8
+	vfnmadd231pd	%ymm5, %ymm15, %ymm9
+	vbroadcastsd	16(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm8
+	vfnmadd231pd	%ymm6, %ymm15, %ymm9
+	vbroadcastsd	24(%r13), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm8
+	vfnmadd231pd	%ymm7, %ymm15, %ymm9
+	vmovapd			%ymm8, 0(%r14)
+	vmovapd			%ymm9, 0(%r14, %r15, 1)
+
+	vmovapd			32(%r14), %ymm8
+	vmovapd			32(%r14, %r15, 1), %ymm9
+	vbroadcastsd	32(%r13), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm8
+	vfnmadd231pd	%ymm4, %ymm15, %ymm9
+	vbroadcastsd	40(%r13), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm8
+	vfnmadd231pd	%ymm5, %ymm15, %ymm9
+	vbroadcastsd	48(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm8
+	vfnmadd231pd	%ymm6, %ymm15, %ymm9
+	vbroadcastsd	56(%r13), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm8
+	vfnmadd231pd	%ymm7, %ymm15, %ymm9
+	vmovapd			%ymm8, 32(%r14)
+	vmovapd			%ymm9, 32(%r14, %r15, 1)
+
+	vmovapd			64(%r14), %ymm8
+	vmovapd			64(%r14, %r15, 1), %ymm9
+	vbroadcastsd	64(%r13), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm8
+	vfnmadd231pd	%ymm4, %ymm15, %ymm9
+	vbroadcastsd	72(%r13), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm8
+	vfnmadd231pd	%ymm5, %ymm15, %ymm9
+	vbroadcastsd	80(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm8
+	vfnmadd231pd	%ymm6, %ymm15, %ymm9
+	vbroadcastsd	88(%r13), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm8
+	vfnmadd231pd	%ymm7, %ymm15, %ymm9
+	vmovapd			%ymm8, 64(%r14)
+	vmovapd			%ymm9, 64(%r14, %r15, 1)
+
+	vmovapd			96(%r14), %ymm8
+	vmovapd			96(%r14, %r15, 1), %ymm9
+	vbroadcastsd	96(%r13), %ymm15
+	addq	$128, %r13
+	vfnmadd231pd	%ymm0, %ymm15, %ymm8
+	vfnmadd231pd	%ymm4, %ymm15, %ymm9
+	vbroadcastsd	-24(%r13), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm8
+	vfnmadd231pd	%ymm5, %ymm15, %ymm9
+	vbroadcastsd	-16(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm8
+	vfnmadd231pd	%ymm6, %ymm15, %ymm9
+	vbroadcastsd	-8(%r13), %ymm15
+	addq	$128, %r14
+	vfnmadd231pd	%ymm3, %ymm15, %ymm8
+	vfnmadd231pd	%ymm7, %ymm15, %ymm9
+	vmovapd			%ymm8, -32(%r14)
+	vmovapd			%ymm9, -32(%r14, %r15, 1)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r14), %ymm8
+	vmovapd			0(%r14, %r15, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm8
+	vfnmadd231pd	%ymm4, %ymm15, %ymm9
+	vbroadcastsd	8(%r13), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm8
+	vfnmadd231pd	%ymm5, %ymm15, %ymm9
+	vbroadcastsd	16(%r13), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm8
+	vfnmadd231pd	%ymm6, %ymm15, %ymm9
+	vbroadcastsd	24(%r13), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm8
+	vfnmadd231pd	%ymm7, %ymm15, %ymm9
+	vmovapd			%ymm8, 0(%r14)
+	vmovapd			%ymm9, 0(%r14, %r15, 1)
+
+	addq	$32, %r13
+	addq	$32, %r14
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger4_sub_8r_vs_lib4, .-kernel_dger4_sub_8r_vs_lib4
+#endif
+
+
+
+
+
+//                               1      2          3          4        5
+// void kernel_dger12_add_4r_lib4(int n, double *A, double *B, int sdb, double *C)
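+//
+// Rank-12 update of a 4-row strip: C += A * B, where A is 4x12, B provides 12
+// rows (stored as three 4-row panels spaced by sdb) and C is 4 x n. A rough C
+// sketch of the math, with plain dense column-major indices for illustration
+// only:
+//
+//	for(jj=0; jj<n; jj++)
+//		for(ii=0; ii<4; ii++)
+//			for(ll=0; ll<12; ll++)
+//				C[ii+jj*4] += A[ii+ll*4] * B[ll+jj*12];
+//
+// The main loop updates 12 columns of C at a time (kept in ymm0-ymm11); the
+// cleanup loop that follows handles the remaining columns four at a time.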
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger12_add_4r_lib4
+	.type kernel_dger12_add_4r_lib4, @function
+kernel_dger12_add_4r_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger12_add_4r_lib4
+_kernel_dger12_add_4r_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger12_add_4r_lib4
+	.def kernel_dger12_add_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger12_add_4r_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10 // n
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d
+	movq	ARG5, %r14 // C
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	cmpl	$11, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	// load block from C
+	vmovapd	0(%r14), %ymm0
+	vmovapd	32(%r14), %ymm1
+	vmovapd	64(%r14), %ymm2
+	vmovapd	96(%r14), %ymm3
+	vmovapd	128(%r14), %ymm4
+	vmovapd	160(%r14), %ymm5
+	vmovapd	192(%r14), %ymm6
+	vmovapd	224(%r14), %ymm7
+	vmovapd	256(%r14), %ymm8
+	vmovapd	288(%r14), %ymm9
+	vmovapd	320(%r14), %ymm10
+	vmovapd	352(%r14), %ymm11
+
+	// 0
+	vmovapd			0(%r11), %ymm12
+	vbroadcastsd	0(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	128(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	160(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	192(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	224(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	256(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	288(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	320(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	352(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 1
+	vmovapd			32(%r11), %ymm12
+	vbroadcastsd	8(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	72(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	136(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	168(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	200(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	232(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	264(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	296(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	328(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	360(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 2
+	vmovapd			64(%r11), %ymm12
+	vbroadcastsd	16(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	48(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	112(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	144(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	176(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	208(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	240(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	272(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	304(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	336(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	368(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 3
+	vmovapd			96(%r11), %ymm12
+	vbroadcastsd	24(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	56(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	88(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	152(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	184(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	216(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	248(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	280(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	312(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	344(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	376(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 4
+	vmovapd			128(%r11), %ymm12
+	vbroadcastsd	0(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	128(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	160(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	192(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	224(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	256(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	288(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	320(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	352(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 5
+	vmovapd			160(%r11), %ymm12
+	vbroadcastsd	8(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	72(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	136(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	168(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	200(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	232(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	264(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	296(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	328(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	360(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 6
+	vmovapd			192(%r11), %ymm12
+	vbroadcastsd	16(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	48(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	112(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	144(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	176(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	208(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	240(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	272(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	304(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	336(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	368(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 7
+	vmovapd			224(%r11), %ymm12
+	vbroadcastsd	24(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	56(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	88(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	152(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	184(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	216(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	248(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	280(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	312(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	344(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	376(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 8
+	vmovapd			256(%r11), %ymm12
+	vbroadcastsd	0(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	128(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	160(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	192(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	224(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	256(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	288(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	320(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	352(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 9
+	vmovapd			288(%r11), %ymm12
+	vbroadcastsd	8(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	72(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	136(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	168(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	200(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	232(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	264(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	296(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	328(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	360(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 10
+	vmovapd			320(%r11), %ymm12
+	vbroadcastsd	16(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	48(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	112(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	144(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	176(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	208(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	240(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	272(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	304(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	336(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	368(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 11
+	vmovapd			352(%r11), %ymm12
+	vbroadcastsd	24(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	56(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	88(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	152(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	184(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	216(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	248(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	280(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	312(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	344(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	376(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// store block to C
+	vmovapd	%ymm0, 0(%r14)
+	vmovapd	%ymm1, 32(%r14)
+	vmovapd	%ymm2, 64(%r14)
+	vmovapd	%ymm3, 96(%r14)
+	vmovapd	%ymm4, 128(%r14)
+	vmovapd	%ymm5, 160(%r14)
+	vmovapd	%ymm6, 192(%r14)
+	vmovapd	%ymm7, 224(%r14)
+	vmovapd	%ymm8, 256(%r14)
+	vmovapd	%ymm9, 288(%r14)
+	vmovapd	%ymm10, 320(%r14)
+	vmovapd	%ymm11, 352(%r14)
+
+	addq	$384, %r12
+	addq	$384, %r14
+	subl	$12, %r10d
+
+	cmpl	$11, %r10d
+	jg		1b // main loop
+
+2:
+	cmpl	$3, %r10d
+	jle		2f // consider 1-column cleanup
+
+	// 4-column cleanup loop
+1:
+	// load block from C
+	vmovapd	0(%r14), %ymm0
+	vmovapd	32(%r14), %ymm1
+	vmovapd	64(%r14), %ymm2
+	vmovapd	96(%r14), %ymm3
+
+	// 0
+	vmovapd			0(%r11), %ymm12
+	vbroadcastsd	0(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 1
+	vmovapd			32(%r11), %ymm12
+	vbroadcastsd	8(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	72(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 2
+	vmovapd			64(%r11), %ymm12
+	vbroadcastsd	16(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	48(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	112(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 3
+	vmovapd			96(%r11), %ymm12
+	vbroadcastsd	24(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	56(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	88(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 4
+	vmovapd			128(%r11), %ymm12
+	vbroadcastsd	0(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 5
+	vmovapd			160(%r11), %ymm12
+	vbroadcastsd	8(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	72(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 6
+	vmovapd			192(%r11), %ymm12
+	vbroadcastsd	16(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	48(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	112(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 7
+	vmovapd			224(%r11), %ymm12
+	vbroadcastsd	24(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	56(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	88(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 8
+	vmovapd			256(%r11), %ymm12
+	vbroadcastsd	0(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 9
+	vmovapd			288(%r11), %ymm12
+	vbroadcastsd	8(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	72(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 10
+	vmovapd			320(%r11), %ymm12
+	vbroadcastsd	16(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	48(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	112(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 11
+	vmovapd			352(%r11), %ymm12
+	vbroadcastsd	24(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	56(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	88(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// store block to C
+	vmovapd	%ymm0, 0(%r14)
+	vmovapd	%ymm1, 32(%r14)
+	vmovapd	%ymm2, 64(%r14)
+	vmovapd	%ymm3, 96(%r14)
+
+	addq	$128, %r12
+	addq	$128, %r14
+	subl	$4, %r10d
+
+	cmpl	$3, %r10d
+	jg		1b // 4-column cleanup loop
+
+2:
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// 1-column cleanup loop
+1:
+	// load block from C
+	vmovapd	0(%r14), %ymm0
+
+	// 0
+	vmovapd			0(%r11), %ymm12
+	vbroadcastsd	0(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 1
+	vmovapd			32(%r11), %ymm12
+	vbroadcastsd	8(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 2
+	vmovapd			64(%r11), %ymm12
+	vbroadcastsd	16(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 3
+	vmovapd			96(%r11), %ymm12
+	vbroadcastsd	24(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 4
+	vmovapd			128(%r11), %ymm12
+	vbroadcastsd	0(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 5
+	vmovapd			160(%r11), %ymm12
+	vbroadcastsd	8(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 6
+	vmovapd			192(%r11), %ymm12
+	vbroadcastsd	16(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 7
+	vmovapd			224(%r11), %ymm12
+	vbroadcastsd	24(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 8
+	vmovapd			256(%r11), %ymm12
+	vbroadcastsd	0(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 9
+	vmovapd			288(%r11), %ymm12
+	vbroadcastsd	8(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 10
+	vmovapd			320(%r11), %ymm12
+	vbroadcastsd	16(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 11
+	vmovapd			352(%r11), %ymm12
+	vbroadcastsd	24(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// store block to C
+	vmovapd	%ymm0, 0(%r14)
+
+	addq	$32, %r12
+	addq	$32, %r14
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+	jg		1b // 1-column cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger12_add_4r_lib4, .-kernel_dger12_add_4r_lib4
+#endif
+
+
+
+
+
+//                               1      2          3          4        5
+// void kernel_dger8_add_4r_lib4(int n, double *A, double *B, int sdb, double *C)
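+//
+// Rank-8 update of a 4-row panel: C <- C + A * B, with all operands in the
+// lib4 panel-major layout used throughout BLASFEO (4-row panels, 32 bytes
+// per column). A is a 4x8 block in a single panel, B spans two 4-row panels
+// separated by sdb*32 bytes (hence the `sall $5` on sdb below), and C is
+// 4 x n. The loops below work through n in blocks of 12, 4 and 1 column(s).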
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger8_add_4r_lib4
+	.type kernel_dger8_add_4r_lib4, @function
+kernel_dger8_add_4r_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger8_add_4r_lib4
+_kernel_dger8_add_4r_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger8_add_4r_lib4
+	.def kernel_dger8_add_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger8_add_4r_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10 // n
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d
+	movq	ARG5, %r14 // C
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	cmpl	$11, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	// load block from C
+	vmovapd	0(%r14), %ymm0
+	vmovapd	32(%r14), %ymm1
+	vmovapd	64(%r14), %ymm2
+	vmovapd	96(%r14), %ymm3
+	vmovapd	128(%r14), %ymm4
+	vmovapd	160(%r14), %ymm5
+	vmovapd	192(%r14), %ymm6
+	vmovapd	224(%r14), %ymm7
+	vmovapd	256(%r14), %ymm8
+	vmovapd	288(%r14), %ymm9
+	vmovapd	320(%r14), %ymm10
+	vmovapd	352(%r14), %ymm11
+
+	// 0
+	vmovapd			0(%r11), %ymm12
+	vbroadcastsd	0(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	128(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	160(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	192(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	224(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	256(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	288(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	320(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	352(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 1
+	vmovapd			32(%r11), %ymm12
+	vbroadcastsd	8(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	72(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	136(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	168(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	200(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	232(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	264(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	296(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	328(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	360(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 2
+	vmovapd			64(%r11), %ymm12
+	vbroadcastsd	16(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	48(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	112(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	144(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	176(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	208(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	240(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	272(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	304(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	336(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	368(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 3
+	vmovapd			96(%r11), %ymm12
+	vbroadcastsd	24(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	56(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	88(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	152(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	184(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	216(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	248(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	280(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	312(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	344(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	376(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 4
+	vmovapd			128(%r11), %ymm12
+	vbroadcastsd	0(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	128(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	160(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	192(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	224(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	256(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	288(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	320(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	352(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 5
+	vmovapd			160(%r11), %ymm12
+	vbroadcastsd	8(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	72(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	136(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	168(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	200(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	232(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	264(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	296(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	328(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	360(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 6
+	vmovapd			192(%r11), %ymm12
+	vbroadcastsd	16(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	48(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	112(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	144(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	176(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	208(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	240(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	272(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	304(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	336(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	368(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 7
+	vmovapd			224(%r11), %ymm12
+	vbroadcastsd	24(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	56(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	88(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	152(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	184(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	216(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	248(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	280(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	312(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	344(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	376(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// store block to C
+	vmovapd	%ymm0, 0(%r14)
+	vmovapd	%ymm1, 32(%r14)
+	vmovapd	%ymm2, 64(%r14)
+	vmovapd	%ymm3, 96(%r14)
+	vmovapd	%ymm4, 128(%r14)
+	vmovapd	%ymm5, 160(%r14)
+	vmovapd	%ymm6, 192(%r14)
+	vmovapd	%ymm7, 224(%r14)
+	vmovapd	%ymm8, 256(%r14)
+	vmovapd	%ymm9, 288(%r14)
+	vmovapd	%ymm10, 320(%r14)
+	vmovapd	%ymm11, 352(%r14)
+
+	addq	$384, %r12
+	addq	$384, %r14
+	subl	$12, %r10d
+
+	cmpl	$11, %r10d
+	jg		1b // main loop
+
+2:
+	cmpl	$3, %r10d
+	jle		2f // consider 1-column cleanup
+
+	// 4-column cleanup loop
+1:
+	// load block from C
+	vmovapd	0(%r14), %ymm0
+	vmovapd	32(%r14), %ymm1
+	vmovapd	64(%r14), %ymm2
+	vmovapd	96(%r14), %ymm3
+
+	// 0
+	vmovapd			0(%r11), %ymm12
+	vbroadcastsd	0(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 1
+	vmovapd			32(%r11), %ymm12
+	vbroadcastsd	8(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	72(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 2
+	vmovapd			64(%r11), %ymm12
+	vbroadcastsd	16(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	48(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	112(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 3
+	vmovapd			96(%r11), %ymm12
+	vbroadcastsd	24(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	56(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	88(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 4
+	vmovapd			128(%r11), %ymm12
+	vbroadcastsd	0(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 5
+	vmovapd			160(%r11), %ymm12
+	vbroadcastsd	8(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	72(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 6
+	vmovapd			192(%r11), %ymm12
+	vbroadcastsd	16(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	48(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	112(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 7
+	vmovapd			224(%r11), %ymm12
+	vbroadcastsd	24(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	56(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	88(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// store block to C
+	vmovapd	%ymm0, 0(%r14)
+	vmovapd	%ymm1, 32(%r14)
+	vmovapd	%ymm2, 64(%r14)
+	vmovapd	%ymm3, 96(%r14)
+
+	addq	$128, %r12
+	addq	$128, %r14
+	subl	$4, %r10d
+
+	cmpl	$3, %r10d
+	jg		1b // 4-column cleanup loop
+
+2:
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// 1-column cleanup loop
+1:
+	// load block from C
+	vmovapd	0(%r14), %ymm0
+
+	// 0
+	vmovapd			0(%r11), %ymm12
+	vbroadcastsd	0(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 1
+	vmovapd			32(%r11), %ymm12
+	vbroadcastsd	8(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 2
+	vmovapd			64(%r11), %ymm12
+	vbroadcastsd	16(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 3
+	vmovapd			96(%r11), %ymm12
+	vbroadcastsd	24(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 4
+	vmovapd			128(%r11), %ymm12
+	vbroadcastsd	0(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 5
+	vmovapd			160(%r11), %ymm12
+	vbroadcastsd	8(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 6
+	vmovapd			192(%r11), %ymm12
+	vbroadcastsd	16(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 7
+	vmovapd			224(%r11), %ymm12
+	vbroadcastsd	24(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// store block to C
+	vmovapd	%ymm0, 0(%r14)
+
+	addq	$32, %r12
+	addq	$32, %r14
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+	jg		1b // 1-column cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger8_add_4r_lib4, .-kernel_dger8_add_4r_lib4
+#endif
+
+
+
+
+
+#if 0
+//                               1      2          3          4        5
+// void kernel_dger8_sub_4r_lib4(int n, double *A, double *B, int sdb, double *C)
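+//
+// Variant of kernel_dger8_add_4r_lib4 that keeps the 4x8 block of A resident
+// in ymm0-ymm7 and streams the columns of C through memory instead; it is
+// compiled out by the enclosing `#if 0`. Note that, despite the _sub_ name in
+// the prototype comment above, the body still accumulates with vfmadd231pd
+// and reuses the _add_ symbol name.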
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger8_add_4r_lib4
+	.type kernel_dger8_add_4r_lib4, @function
+kernel_dger8_add_4r_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger8_add_4r_lib4
+_kernel_dger8_add_4r_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger8_add_4r_lib4
+	.def kernel_dger8_add_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger8_add_4r_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	movq	ARG4, %r13
+	sall	$5, %r13d
+	movq	ARG5, %r14
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// load block from A
+	vmovapd	0(%r11), %ymm0
+	vmovapd	32(%r11), %ymm1
+	vmovapd	64(%r11), %ymm2
+	vmovapd	96(%r11), %ymm3
+	vmovapd	128(%r11), %ymm4
+	vmovapd	160(%r11), %ymm5
+	vmovapd	192(%r11), %ymm6
+	vmovapd	224(%r11), %ymm7
+
+	cmpl	$7, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	// 04
+	vmovapd			0(%r14), %ymm12
+	vbroadcastsd	0(%r12), %ymm15
+	vfmadd231pd		%ymm0, %ymm15, %ymm12
+	vbroadcastsd	8(%r12), %ymm15
+	vfmadd231pd		%ymm1, %ymm15, %ymm12
+	vbroadcastsd	16(%r12), %ymm15
+	vfmadd231pd		%ymm2, %ymm15, %ymm12
+	vbroadcastsd	24(%r12), %ymm15
+	vfmadd231pd		%ymm3, %ymm15, %ymm12
+	vmovapd			%ymm12, 0(%r14)
+
+	// 14
+	vmovapd			32(%r14), %ymm12
+	vbroadcastsd	32(%r12), %ymm15
+	vfmadd231pd		%ymm0, %ymm15, %ymm12
+	vbroadcastsd	40(%r12), %ymm15
+	vfmadd231pd		%ymm1, %ymm15, %ymm12
+	vbroadcastsd	48(%r12), %ymm15
+	vfmadd231pd		%ymm2, %ymm15, %ymm12
+	vbroadcastsd	56(%r12), %ymm15
+	vfmadd231pd		%ymm3, %ymm15, %ymm12
+	vmovapd			%ymm12, 32(%r14)
+
+	// 24
+	vmovapd			64(%r14), %ymm12
+	vbroadcastsd	64(%r12), %ymm15
+	vfmadd231pd		%ymm0, %ymm15, %ymm12
+	vbroadcastsd	72(%r12), %ymm15
+	vfmadd231pd		%ymm1, %ymm15, %ymm12
+	vbroadcastsd	80(%r12), %ymm15
+	vfmadd231pd		%ymm2, %ymm15, %ymm12
+	vbroadcastsd	88(%r12), %ymm15
+	vfmadd231pd		%ymm3, %ymm15, %ymm12
+	vmovapd			%ymm12, 64(%r14)
+
+	// 34
+	vmovapd			96(%r14), %ymm12
+	vbroadcastsd	96(%r12), %ymm15
+	vfmadd231pd		%ymm0, %ymm15, %ymm12
+	vbroadcastsd	104(%r12), %ymm15
+	vfmadd231pd		%ymm1, %ymm15, %ymm12
+	vbroadcastsd	112(%r12), %ymm15
+	vfmadd231pd		%ymm2, %ymm15, %ymm12
+	vbroadcastsd	120(%r12), %ymm15
+	vfmadd231pd		%ymm3, %ymm15, %ymm12
+	vmovapd			%ymm12, 96(%r14)
+
+	// 44
+	vmovapd			128(%r14), %ymm12
+	vbroadcastsd	128(%r12), %ymm15
+	vfmadd231pd		%ymm0, %ymm15, %ymm12
+	vbroadcastsd	136(%r12), %ymm15
+	vfmadd231pd		%ymm1, %ymm15, %ymm12
+	vbroadcastsd	144(%r12), %ymm15
+	vfmadd231pd		%ymm2, %ymm15, %ymm12
+	vbroadcastsd	152(%r12), %ymm15
+	vfmadd231pd		%ymm3, %ymm15, %ymm12
+	vmovapd			%ymm12, 128(%r14)
+
+	// 54
+	vmovapd			160(%r14), %ymm12
+	vbroadcastsd	160(%r12), %ymm15
+	vfmadd231pd		%ymm0, %ymm15, %ymm12
+	vbroadcastsd	168(%r12), %ymm15
+	vfmadd231pd		%ymm1, %ymm15, %ymm12
+	vbroadcastsd	176(%r12), %ymm15
+	vfmadd231pd		%ymm2, %ymm15, %ymm12
+	vbroadcastsd	184(%r12), %ymm15
+	vfmadd231pd		%ymm3, %ymm15, %ymm12
+	vmovapd			%ymm12, 160(%r14)
+
+	// 64
+	vmovapd			192(%r14), %ymm12
+	vbroadcastsd	192(%r12), %ymm15
+	vfmadd231pd		%ymm0, %ymm15, %ymm12
+	vbroadcastsd	200(%r12), %ymm15
+	vfmadd231pd		%ymm1, %ymm15, %ymm12
+	vbroadcastsd	208(%r12), %ymm15
+	vfmadd231pd		%ymm2, %ymm15, %ymm12
+	vbroadcastsd	216(%r12), %ymm15
+	vfmadd231pd		%ymm3, %ymm15, %ymm12
+	vmovapd			%ymm12, 192(%r14)
+
+	// 74
+	vmovapd			224(%r14), %ymm12
+	vbroadcastsd	224(%r12), %ymm15
+	vfmadd231pd		%ymm0, %ymm15, %ymm12
+	vbroadcastsd	232(%r12), %ymm15
+	vfmadd231pd		%ymm1, %ymm15, %ymm12
+	vbroadcastsd	240(%r12), %ymm15
+	vfmadd231pd		%ymm2, %ymm15, %ymm12
+	vbroadcastsd	248(%r12), %ymm15
+	vfmadd231pd		%ymm3, %ymm15, %ymm12
+	vmovapd			%ymm12, 224(%r14)
+
+	// 08
+	vmovapd			0(%r14), %ymm12
+	vbroadcastsd	0(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm4, %ymm15, %ymm12
+	vbroadcastsd	8(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm5, %ymm15, %ymm12
+	vbroadcastsd	16(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm6, %ymm15, %ymm12
+	vbroadcastsd	24(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm7, %ymm15, %ymm12
+	vmovapd			%ymm12, 0(%r14)
+
+	// 18
+	vmovapd			32(%r14), %ymm12
+	vbroadcastsd	32(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm4, %ymm15, %ymm12
+	vbroadcastsd	40(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm5, %ymm15, %ymm12
+	vbroadcastsd	48(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm6, %ymm15, %ymm12
+	vbroadcastsd	56(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm7, %ymm15, %ymm12
+	vmovapd			%ymm12, 32(%r14)
+
+	// 28
+	vmovapd			64(%r14), %ymm12
+	vbroadcastsd	64(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm4, %ymm15, %ymm12
+	vbroadcastsd	72(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm5, %ymm15, %ymm12
+	vbroadcastsd	80(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm6, %ymm15, %ymm12
+	vbroadcastsd	88(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm7, %ymm15, %ymm12
+	vmovapd			%ymm12, 64(%r14)
+
+	// 38
+	vmovapd			96(%r14), %ymm12
+	vbroadcastsd	96(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm4, %ymm15, %ymm12
+	vbroadcastsd	104(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm5, %ymm15, %ymm12
+	vbroadcastsd	112(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm6, %ymm15, %ymm12
+	vbroadcastsd	120(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm7, %ymm15, %ymm12
+	vmovapd			%ymm12, 96(%r14)
+
+	// 48
+	vmovapd			128(%r14), %ymm12
+	vbroadcastsd	128(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm4, %ymm15, %ymm12
+	vbroadcastsd	136(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm5, %ymm15, %ymm12
+	vbroadcastsd	144(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm6, %ymm15, %ymm12
+	vbroadcastsd	152(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm7, %ymm15, %ymm12
+	vmovapd			%ymm12, 128(%r14)
+
+	// 58
+	vmovapd			160(%r14), %ymm12
+	vbroadcastsd	160(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm4, %ymm15, %ymm12
+	vbroadcastsd	168(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm5, %ymm15, %ymm12
+	vbroadcastsd	176(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm6, %ymm15, %ymm12
+	vbroadcastsd	184(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm7, %ymm15, %ymm12
+	vmovapd			%ymm12, 160(%r14)
+
+	// 68
+	vmovapd			192(%r14), %ymm12
+	vbroadcastsd	192(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm4, %ymm15, %ymm12
+	vbroadcastsd	200(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm5, %ymm15, %ymm12
+	vbroadcastsd	208(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm6, %ymm15, %ymm12
+	vbroadcastsd	216(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm7, %ymm15, %ymm12
+	vmovapd			%ymm12, 192(%r14)
+
+	// 78
+	vmovapd			224(%r14), %ymm12
+	vbroadcastsd	224(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm4, %ymm15, %ymm12
+	vbroadcastsd	232(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm5, %ymm15, %ymm12
+	vbroadcastsd	240(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm6, %ymm15, %ymm12
+	vbroadcastsd	248(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm7, %ymm15, %ymm12
+	vmovapd			%ymm12, 224(%r14)
+
+	addq	$256, %r12
+	addq	$256, %r14
+	subl	$8, %r10d
+
+	cmpl	$7, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r14), %ymm12
+	vbroadcastsd	0(%r12), %ymm15
+	vfmadd231pd		%ymm0, %ymm15, %ymm12
+	vbroadcastsd	8(%r12), %ymm15
+	vfmadd231pd		%ymm1, %ymm15, %ymm12
+	vbroadcastsd	16(%r12), %ymm15
+	vfmadd231pd		%ymm2, %ymm15, %ymm12
+	vbroadcastsd	24(%r12), %ymm15
+	vfmadd231pd		%ymm3, %ymm15, %ymm12
+	vbroadcastsd	0(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm4, %ymm15, %ymm12
+	vbroadcastsd	8(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm5, %ymm15, %ymm12
+	vbroadcastsd	16(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm6, %ymm15, %ymm12
+	vbroadcastsd	24(%r12, %r13, 1), %ymm15
+	vfmadd231pd		%ymm7, %ymm15, %ymm12
+	vmovapd			%ymm12, 0(%r14)
+
+	addq	$32, %r12
+	addq	$32, %r14
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger8_add_4r_lib4, .-kernel_dger8_add_4r_lib4
+#endif
+#endif
+
+
+
+
+
+//                              1      2          3          4
+// void kernel_dger4_sub_4r_lib4(int n, double *A, double *B, double *C)
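+//
+// Rank-4 downdate of a 4-row panel: C <- C - A * B, with A a 4x4 block held
+// in ymm0-ymm3 for the whole call, B 4 x n and C 4 x n, all in lib4 layout
+// (32 bytes per column). Columns of C are updated in place, 4 per iteration
+// in the main loop and 1 per iteration in the cleanup loop.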
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger4_sub_4r_lib4
+	.type kernel_dger4_sub_4r_lib4, @function
+kernel_dger4_sub_4r_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger4_sub_4r_lib4
+_kernel_dger4_sub_4r_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger4_sub_4r_lib4
+	.def kernel_dger4_sub_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_4r_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	movq	ARG4, %r13
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// load block from A
+	vmovapd	0(%r11), %ymm0
+	vmovapd	32(%r11), %ymm1
+	vmovapd	64(%r11), %ymm2
+	vmovapd	96(%r11), %ymm3
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r13), %ymm4
+	vbroadcastsd	0(%r12), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm4
+	vbroadcastsd	8(%r12), %ymm15
+	subl	$4, %r10d
+	vfnmadd231pd	%ymm1, %ymm15, %ymm4
+	vbroadcastsd	16(%r12), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm4
+	vbroadcastsd	24(%r12), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm4
+	vmovapd			%ymm4, 0(%r13)
+
+	vmovapd			32(%r13), %ymm4
+	vbroadcastsd	32(%r12), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm4
+	vbroadcastsd	40(%r12), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm4
+	vbroadcastsd	48(%r12), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm4
+	vbroadcastsd	56(%r12), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm4
+	vmovapd			%ymm4, 32(%r13)
+
+	vmovapd			64(%r13), %ymm4
+	vbroadcastsd	64(%r12), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm4
+	vbroadcastsd	72(%r12), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm4
+	vbroadcastsd	80(%r12), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm4
+	vbroadcastsd	88(%r12), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm4
+	vmovapd			%ymm4, 64(%r13)
+
+	vmovapd			96(%r13), %ymm4
+	vbroadcastsd	96(%r12), %ymm15
+	addq	$128, %r12
+	vfnmadd231pd	%ymm0, %ymm15, %ymm4
+	vbroadcastsd	-24(%r12), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm4
+	vbroadcastsd	-16(%r12), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm4
+	vbroadcastsd	-8(%r12), %ymm15
+	addq	$128, %r13
+	vfnmadd231pd	%ymm3, %ymm15, %ymm4
+	vmovapd			%ymm4, -32(%r13)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r13), %ymm4
+	vbroadcastsd	0(%r12), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm4
+	vbroadcastsd	8(%r12), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm4
+	vbroadcastsd	16(%r12), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm4
+	vbroadcastsd	24(%r12), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm4
+	vmovapd			%ymm4, 0(%r13)
+
+	addq	$32, %r12
+	addq	$32, %r13
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger4_sub_4r_lib4, .-kernel_dger4_sub_4r_lib4
+#endif
+
+
+
+
+
+//                              1      2          3          4
+// void kernel_dger2_sub_4r_lib4(int n, double *A, double *B, double *C)
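+//
+// Rank-2 counterpart of kernel_dger4_sub_4r_lib4: C <- C - A * B with A a
+// 4x2 block held in ymm0-ymm1, B 2 x n and C 4 x n, same lib4 layout and
+// same 4-column main loop / 1-column cleanup structure.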
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger2_sub_4r_lib4
+	.type kernel_dger2_sub_4r_lib4, @function
+kernel_dger2_sub_4r_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger2_sub_4r_lib4
+_kernel_dger2_sub_4r_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger2_sub_4r_lib4
+	.def kernel_dger2_sub_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger2_sub_4r_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	movq	ARG4, %r13
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// load block from A
+	vmovapd	0(%r11), %ymm0
+	vmovapd	32(%r11), %ymm1
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r13), %ymm4
+	vbroadcastsd	0(%r12), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm4
+	vbroadcastsd	8(%r12), %ymm15
+	subl	$4, %r10d
+	vfnmadd231pd	%ymm1, %ymm15, %ymm4
+	vmovapd			%ymm4, 0(%r13)
+
+	vmovapd			32(%r13), %ymm4
+	vbroadcastsd	32(%r12), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm4
+	vbroadcastsd	40(%r12), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm4
+	vmovapd			%ymm4, 32(%r13)
+
+	vmovapd			64(%r13), %ymm4
+	vbroadcastsd	64(%r12), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm4
+	vbroadcastsd	72(%r12), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm4
+	vmovapd			%ymm4, 64(%r13)
+
+	vmovapd			96(%r13), %ymm4
+	vbroadcastsd	96(%r12), %ymm15
+	addq	$128, %r12
+	vfnmadd231pd	%ymm0, %ymm15, %ymm4
+	vbroadcastsd	-24(%r12), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm4
+	addq	$128, %r13
+	vmovapd			%ymm4, -32(%r13)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r13), %ymm4
+	vbroadcastsd	0(%r12), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm4
+	vbroadcastsd	8(%r12), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm4
+	vmovapd			%ymm4, 0(%r13)
+
+	addq	$32, %r12
+	addq	$32, %r13
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger2_sub_4r_lib4, .-kernel_dger2_sub_4r_lib4
+#endif
+
+
+
+
+
+//                                 1      2          3          4          5
+// void kernel_dger4_sub_4r_vs_lib4(int n, double *A, double *B, double *C, int km)
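+//
+// Variable-size version of kernel_dger4_sub_4r_lib4: km gives the number of
+// valid rows (<= 4) of the 4x4 block of A. The mask ymm15 is built from the
+// .LC00 constants {0.5, 1.5, 2.5, 3.5} minus km, so vmaskmovpd loads only
+// the first km rows of A (the rest read as zero) and the trailing rows of C
+// are left unchanged by the update.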
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dger4_sub_4r_vs_lib4
+	.type kernel_dger4_sub_4r_vs_lib4, @function
+kernel_dger4_sub_4r_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dger4_sub_4r_vs_lib4
+_kernel_dger4_sub_4r_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dger4_sub_4r_vs_lib4
+	.def kernel_dger4_sub_4r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_4r_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	movq	ARG4, %r13
+	movq	ARG5, %r14
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	vcvtsi2sd	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC00(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC00(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	// load block from A
+	vmaskmovpd	0(%r11), %ymm15, %ymm0
+	vmaskmovpd	32(%r11), %ymm15, %ymm1
+	vmaskmovpd	64(%r11), %ymm15, %ymm2
+	vmaskmovpd	96(%r11), %ymm15, %ymm3
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r13), %ymm4
+	vbroadcastsd	0(%r12), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm4
+	vbroadcastsd	8(%r12), %ymm15
+	subl	$4, %r10d
+	vfnmadd231pd	%ymm1, %ymm15, %ymm4
+	vbroadcastsd	16(%r12), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm4
+	vbroadcastsd	24(%r12), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm4
+	vmovapd			%ymm4, 0(%r13)
+
+	vmovapd			32(%r13), %ymm4
+	vbroadcastsd	32(%r12), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm4
+	vbroadcastsd	40(%r12), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm4
+	vbroadcastsd	48(%r12), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm4
+	vbroadcastsd	56(%r12), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm4
+	vmovapd			%ymm4, 32(%r13)
+
+	vmovapd			64(%r13), %ymm4
+	vbroadcastsd	64(%r12), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm4
+	vbroadcastsd	72(%r12), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm4
+	vbroadcastsd	80(%r12), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm4
+	vbroadcastsd	88(%r12), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm4
+	vmovapd			%ymm4, 64(%r13)
+
+	vmovapd			96(%r13), %ymm4
+	vbroadcastsd	96(%r12), %ymm15
+	addq	$128, %r12
+	vfnmadd231pd	%ymm0, %ymm15, %ymm4
+	vbroadcastsd	-24(%r12), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm4
+	vbroadcastsd	-16(%r12), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm4
+	vbroadcastsd	-8(%r12), %ymm15
+	addq	$128, %r13
+	vfnmadd231pd	%ymm3, %ymm15, %ymm4
+	vmovapd			%ymm4, -32(%r13)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r13), %ymm4
+	vbroadcastsd	0(%r12), %ymm15
+	vfnmadd231pd	%ymm0, %ymm15, %ymm4
+	vbroadcastsd	8(%r12), %ymm15
+	vfnmadd231pd	%ymm1, %ymm15, %ymm4
+	vbroadcastsd	16(%r12), %ymm15
+	vfnmadd231pd	%ymm2, %ymm15, %ymm4
+	vbroadcastsd	24(%r12), %ymm15
+	vfnmadd231pd	%ymm3, %ymm15, %ymm4
+	vmovapd			%ymm4, 0(%r13)
+
+	addq	$32, %r12
+	addq	$32, %r13
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // cleanup loop
+
+	// return
+0:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dger4_sub_4r_vs_lib4, .-kernel_dger4_sub_4r_vs_lib4
+#endif
+
+
+
+
+
+	// read-only data
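+	//
+	// .LC00/.LC01/.LC02 hold {0.5, 1.5, ..., 11.5}. Subtracting an integer
+	// count from them gives negative values (sign bit set) exactly in the
+	// lanes that are still in range, which is the mask form vmaskmovpd
+	// expects; .LC00 provides the 4-lane mask used by the _vs kernel above.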
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00:
+#elif defined(OS_MAC)
+LC00:
+	.align 5
+#endif
+	.double 0.5
+	.double 1.5
+	.double 2.5
+	.double 3.5
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01:
+#elif defined(OS_MAC)
+LC01:
+	.align 5
+#endif
+	.double 4.5
+	.double 5.5
+	.double 6.5
+	.double 7.5
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02:
+#elif defined(OS_MAC)
+LC02:
+	.align 5
+#endif
+	.double 8.5
+	.double 9.5
+	.double 10.5
+	.double 11.5
+
+
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgelqf_4_lib4.S b/kernel/avx2/kernel_dgelqf_4_lib4.S
new file mode 100644
index 0000000..2f8b1be
--- /dev/null
+++ b/kernel/avx2/kernel_dgelqf_4_lib4.S
@@ -0,0 +1,5728 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
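+// The macros below abstract the calling convention: ARGn maps the n-th C
+// argument to its register or stack slot (System V AMD64 on Linux/Mac,
+// Microsoft x64 on Windows), while PROLOGUE/EPILOGUE save and restore the
+// callee-saved registers (plus rdi, rsi and xmm6-xmm15 on Windows) around
+// each kernel.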
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+//                                   1      2           3        4           5
+// void kernel_dgelqf_dlarft12_12_lib4(int n, double *pD, int sdd, double *dD, double *pT)
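+//
+// LQ factorization kernel: for each row it computes the squared norm of the
+// trailing row elements, forms the Householder reflector and its scalar
+// factor (stored in dD), accumulates the block-reflector matrix T (dlarft)
+// in pT, and applies the reflector from the right to the 12 rows of D held
+// in three 4-row lib4 panels (panel byte stride sdd*32), which is why three
+// accumulators ymm0-ymm2 are carried through the update loops below.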
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgelqf_dlarft12_12_lib4
+	.type kernel_dgelqf_dlarft12_12_lib4, @function
+kernel_dgelqf_dlarft12_12_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgelqf_dlarft12_12_lib4
+_kernel_dgelqf_dlarft12_12_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgelqf_dlarft12_12_lib4
+	.def kernel_dgelqf_dlarft12_12_lib4; .scl 2; .type 32; .endef
+kernel_dgelqf_dlarft12_12_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero T
+
+	movq	ARG5, %r10 // T
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			%ymm15, 0(%r10)
+	vmovapd			%ymm15, 32(%r10)
+	vmovapd			%ymm15, 64(%r10)
+	vmovapd			%ymm15, 96(%r10)
+
+	// first column
+
+	movq	ARG2, %r11 // D
+	movq	ARG3, %r14 // sdd
+	sall	$5, %r14d
+	movq	ARG4, %r12 // dD
+	movq	ARG5, %r13 // T
+	movq	$384, %r15 // sdt (hard-coded)
+
+	vxorpd			%xmm15, %xmm15, %xmm15
+	movq	ARG1, %r10 // n
+	subl	$1, %r10d
+	addq	$32, %r11
+100:
+	vmovsd			0(%r11), %xmm14
+	vfmadd231sd		%xmm14, %xmm14, %xmm15
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		100b
+
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		101f
+	vmovsd			%xmm14, 0(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	vmovsd			0(%r11), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 0(%r11) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 0(%r12) // dD[0]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 0(%r13) // pT[0+ps*0]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			0(%r11), %ymm0
+	vmovapd			0(%r11, %r14, 1), %ymm1
+	vmovapd			0(%r11, %r14, 2), %ymm2
+	vbroadcastsd	32(%r11), %ymm8
+	vbroadcastsd	64(%r11), %ymm9
+	vbroadcastsd	96(%r11), %ymm10
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		32(%r11), %ymm8, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		64(%r11), %ymm9, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm10, %ymm2
+	vmovsd			%xmm8, 32(%r11)
+	vmovsd			%xmm9, 64(%r11)
+	vmovsd			%xmm10, 96(%r11)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	0(%r11), %ymm8
+	vbroadcastsd	32(%r11), %ymm9
+	vbroadcastsd	64(%r11), %ymm10
+	vbroadcastsd	96(%r11), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm10, %ymm2
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm11, %ymm2
+	vmovsd			%xmm8, 0(%r11)
+	vmovsd			%xmm9, 32(%r11)
+	vmovsd			%xmm10, 64(%r11)
+	vmovsd			%xmm11, 96(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	0(%r11), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vmovsd			%xmm8, 0(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vbroadcastsd	0(%r13), %ymm15
+	vmulpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm15, %ymm2, %ymm2
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vblendpd		$0x1, %ymm15, %ymm0, %ymm0
+
+	movq	ARG2, %r11 // D
+	//
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vaddpd			%ymm0, %ymm8, %ymm8
+	vaddpd			%ymm1, %ymm9, %ymm9
+	vaddpd			%ymm2, %ymm10, %ymm10
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	32(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmovapd			%ymm8, 32(%r11)
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	64(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmulpd			%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	96(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	0(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	32(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 32(%r11)
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	64(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	96(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	//
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	0(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0x55, %ymm15, %ymm15  // beta
+
+	// second column
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 8(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	vmovsd			40(%r11), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 40(%r11) // pD[1+ps*1]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 8(%r12) // dD[1]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 40(%r13) // pT[1+ps*1]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			32(%r11), %ymm0
+	vmovapd			32(%r11, %r14, 1), %ymm1
+	vmovapd			32(%r11, %r14, 2), %ymm2
+	vbroadcastsd	72(%r11), %ymm9
+	vbroadcastsd	104(%r11), %ymm10
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		64(%r11), %ymm9, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm10, %ymm2
+	vmovsd			%xmm9, 72(%r11)
+	vmovsd			%xmm10, 104(%r11)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	8(%r11), %ymm8
+	vbroadcastsd	40(%r11), %ymm9
+	vbroadcastsd	72(%r11), %ymm10
+	vbroadcastsd	104(%r11), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm10, %ymm2
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm11, %ymm2
+	vmovsd			%xmm8, 8(%r11)
+	vmovsd			%xmm9, 40(%r11)
+	vmovsd			%xmm10, 72(%r11)
+	vmovsd			%xmm11, 104(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	8(%r11), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vmovsd			%xmm8, 8(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC02(%rip), %ymm12
+#else
+	vmovapd			LC02(%rip), %ymm12
+#endif
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm0
+	vbroadcastsd	40(%r13), %ymm15
+	vmulpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm15, %ymm2, %ymm2
+	vmovsd			%xmm0, 32(%r13)
+
+	vxorpd			%ymm12, %ymm12, %ymm12
+	vblendpd		$0x3, %ymm12, %ymm0, %ymm0
+
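+	// note: the loops below apply the new reflector to the trailing columns of D
+	// (ymm0-ymm2 hold the update for the three row panels) and accumulate in
+	// ymm15, lane by lane, the squares of the updated first-panel entries of the
+	// columns past the next diagonal; the vpermpd at label 107 then picks the
+	// lane of the row carrying the next reflector, producing sigma for the next
+	// column.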
+	movq	ARG2, %r11 // D
+	//
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vaddpd			%ymm0, %ymm8, %ymm8
+	vaddpd			%ymm1, %ymm9, %ymm9
+	vaddpd			%ymm2, %ymm10, %ymm10
+	vmovapd			%ymm8, 32(%r11)
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	72(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	104(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmulpd			%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	8(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	40(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 32(%r11)
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	72(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	104(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	8(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0xaa, %ymm15, %ymm15  // beta
+
+	// third column
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 16(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	vmovsd			80(%r11), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 80(%r11) // pD[2+ps*2]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 16(%r12) // dD[2]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 80(%r13) // pT[2+ps*2]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			64(%r11), %ymm0
+	vmovapd			64(%r11, %r14, 1), %ymm1
+	vmovapd			64(%r11, %r14, 2), %ymm2
+	vbroadcastsd	112(%r11), %ymm10
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm10, %ymm2
+	vmovsd			%xmm10, 112(%r11)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	16(%r11), %ymm8
+	vbroadcastsd	48(%r11), %ymm9
+	vbroadcastsd	80(%r11), %ymm10
+	vbroadcastsd	112(%r11), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm10, %ymm2
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm11, %ymm2
+	vmovsd			%xmm8, 16(%r11)
+	vmovsd			%xmm9, 48(%r11)
+	vmovsd			%xmm10, 80(%r11)
+	vmovsd			%xmm11, 112(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	16(%r11), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vmovsd			%xmm8, 16(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vxorpd			%xmm12, %xmm12, %xmm12
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm15
+	vmovapd			32(%r13), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vblendpd		$0x7, %ymm15, %ymm0, %ymm0
+	vbroadcastsd	80(%r13), %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm0
+	vmulpd			%ymm14, %ymm1, %ymm1
+	vmulpd			%ymm14, %ymm2, %ymm2
+	vmovapd			%xmm0, 64(%r13)
+
+	vxorpd			%ymm12, %ymm12, %ymm12
+	vblendpd		$0x7, %ymm12, %ymm0, %ymm0
+
+	movq	ARG2, %r11 // D
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vaddpd			%ymm0, %ymm8, %ymm8
+	vaddpd			%ymm1, %ymm9, %ymm9
+	vaddpd			%ymm2, %ymm10, %ymm10
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	112(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	16(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	48(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 32(%r11)
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	80(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	112(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	16(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0xff, %ymm15, %ymm15  // beta
+
+	// fourth column
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 24(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	vmovsd			120(%r11), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 120(%r11) // pD[3+ps*3]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 24(%r12) // dD[3]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 120(%r13) // pT[3+ps*3]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			96(%r11), %ymm0
+	vmovapd			96(%r11, %r14, 1), %ymm1
+	vmovapd			96(%r11, %r14, 2), %ymm2
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	24(%r11), %ymm8
+	vbroadcastsd	56(%r11), %ymm9
+	vbroadcastsd	88(%r11), %ymm10
+	vbroadcastsd	120(%r11), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm10, %ymm2
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm11, %ymm2
+	vmovsd			%xmm8, 24(%r11)
+	vmovsd			%xmm9, 56(%r11)
+	vmovsd			%xmm10, 88(%r11)
+	vmovsd			%xmm11, 120(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	24(%r11), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vmovsd			%xmm8, 24(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vxorpd			%xmm12, %xmm12, %xmm12
+	//
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm15
+	//
+	vmovapd			32(%r13), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vmovapd			64(%r13), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vbroadcastsd	120(%r13), %ymm14
+	vmulpd			%ymm14, %ymm15, %ymm15
+	vmulpd			%ymm14, %ymm1, %ymm1
+	vmulpd			%ymm14, %ymm2, %ymm2
+	vmovapd			96(%r13), %ymm0
+	vblendpd		$0x7, %ymm15, %ymm0, %ymm0
+	vmovapd			%ymm0, 96(%r13)
+
+	movq	ARG2, %r11 // D
+	//
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vaddpd			%ymm1, %ymm9, %ymm9
+	vaddpd			%ymm2, %ymm10, %ymm10
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+	//
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	24(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	56(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmulpd			%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	88(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	120(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	24(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	56(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	88(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	120(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	24(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+//	vpermpd	$0x00, %ymm15, %ymm15  // beta
+
+	// fifth column
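+	// note: from this column on the diagonal sits in the second 4-row panel of D,
+	// so alpha and the reflector row are addressed through %r14 (which appears to
+	// hold the row-panel stride of D) and the new T entries through %r15
+	// (apparently the row-panel stride of T); the trailing-column counter also
+	// starts from n-8 here instead of n-4.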
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 32(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	addq	$128, %r11
+	vmovsd			0(%r11, %r14, 1), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 0(%r11, %r14, 1) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 32(%r12) // dD[4]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 128(%r13, %r15, 1) // pT[0+ps*0]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			0(%r11), %ymm0
+	vmovapd			0(%r11, %r14, 1), %ymm1
+	vmovapd			0(%r11, %r14, 2), %ymm2
+	vbroadcastsd	32(%r11, %r14, 1), %ymm8
+	vbroadcastsd	64(%r11, %r14, 1), %ymm9
+	vbroadcastsd	96(%r11, %r14, 1), %ymm10
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		32(%r11), %ymm8, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		64(%r11), %ymm9, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm10, %ymm2
+	vmovsd			%xmm8, 32(%r11, %r14, 1)
+	vmovsd			%xmm9, 64(%r11, %r14, 1)
+	vmovsd			%xmm10, 96(%r11, %r14, 1)
+	movq	ARG1, %r10 // n
+	subl	$8, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	0(%r11, %r14, 1), %ymm8
+	vbroadcastsd	32(%r11, %r14, 1), %ymm9
+	vbroadcastsd	64(%r11, %r14, 1), %ymm10
+	vbroadcastsd	96(%r11, %r14, 1), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm10, %ymm2
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm11, %ymm2
+	vmovsd			%xmm8, 0(%r11, %r14, 1)
+	vmovsd			%xmm9, 32(%r11, %r14, 1)
+	vmovsd			%xmm10, 64(%r11, %r14, 1)
+	vmovsd			%xmm11, 96(%r11, %r14, 1)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	0(%r11, %r14, 1), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vmovsd			%xmm8, 0(%r11, %r14, 1)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vxorpd			%xmm12, %xmm12, %xmm12
+	//
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm15
+	//
+	vmovapd			32(%r13), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vmovapd			64(%r13), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vmovapd			96(%r13), %ymm14
+	vpermpd			$0xff, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vbroadcastsd	128(%r13, %r15, 1), %ymm14
+	vmulpd			%ymm14, %ymm15, %ymm15
+	vmulpd			%ymm14, %ymm1, %ymm1
+	vmulpd			%ymm14, %ymm2, %ymm2
+//	vmovapd			128(%r13), %ymm0
+//	vblendpd		$0xf, %ymm15, %ymm0, %ymm15
+	vmovapd			%ymm15, 128(%r13)
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vblendpd		$0x1, %ymm15, %ymm1, %ymm1
+
+	movq	ARG2, %r11 // D
+	//
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+	//
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vaddpd			%ymm1, %ymm9, %ymm9
+	vaddpd			%ymm2, %ymm10, %ymm10
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	32(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	64(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmulpd			%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	96(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	0(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	32(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	64(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	96(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	//
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	0(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0x55, %ymm15, %ymm15  // beta
+
+	// sixth column
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 40(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	addq	$128, %r11
+	vmovsd			40(%r11, %r14, 1), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 40(%r11, %r14, 1) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 40(%r12) // dD[5]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 168(%r13, %r15, 1) // pT[0+ps*0]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			32(%r11), %ymm0
+	vmovapd			32(%r11, %r14, 1), %ymm1
+	vmovapd			32(%r11, %r14, 2), %ymm2
+	vbroadcastsd	72(%r11, %r14, 1), %ymm9
+	vbroadcastsd	104(%r11, %r14, 1), %ymm10
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		64(%r11), %ymm9, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm10, %ymm2
+	vmovsd			%xmm9, 72(%r11, %r14, 1)
+	vmovsd			%xmm10, 104(%r11, %r14, 1)
+	movq	ARG1, %r10 // n
+	subl	$8, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	8(%r11, %r14, 1), %ymm8
+	vbroadcastsd	40(%r11, %r14, 1), %ymm9
+	vbroadcastsd	72(%r11, %r14, 1), %ymm10
+	vbroadcastsd	104(%r11, %r14, 1), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm10, %ymm2
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm11, %ymm2
+	vmovsd			%xmm8, 8(%r11, %r14, 1)
+	vmovsd			%xmm9, 40(%r11, %r14, 1)
+	vmovsd			%xmm10, 72(%r11, %r14, 1)
+	vmovsd			%xmm11, 104(%r11, %r14, 1)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	8(%r11, %r14, 1), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vmovsd			%xmm8, 8(%r11, %r14, 1)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vxorpd			%xmm12, %xmm12, %xmm12
+	//
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm15
+	//
+	vmovapd			32(%r13), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vmovapd			64(%r13), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vmovapd			96(%r13), %ymm14
+	vpermpd			$0xff, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vmovapd			128(%r13), %ymm14
+	vmovapd			128(%r13, %r15, 1), %ymm11
+	vblendpd		$0x1, %ymm11, %ymm12, %ymm11
+	vpermpd			$0x00, %ymm1, %ymm13 // vv
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmulpd			%ymm11, %ymm13, %ymm11
+	//
+	vbroadcastsd	168(%r13, %r15, 1), %ymm14
+	vmulpd			%ymm14, %ymm15, %ymm15
+	vmulpd			%ymm14, %ymm11, %ymm11
+	vmulpd			%ymm14, %ymm1, %ymm1
+	vmulpd			%ymm14, %ymm2, %ymm2
+	vmovapd			160(%r13, %r15, 1), %ymm0
+	vblendpd		$0x1, %ymm11, %ymm0, %ymm11
+	vmovapd			%ymm15, 160(%r13)
+	vmovapd			%ymm11, 160(%r13, %r15, 1)
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vblendpd		$0x3, %ymm15, %ymm1, %ymm1
+
+	movq	ARG2, %r11 // D
+	//
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+	//
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vaddpd			%ymm1, %ymm9, %ymm9
+	vaddpd			%ymm2, %ymm10, %ymm10
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	72(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	104(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmulpd			%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	8(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	40(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	72(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	104(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	//
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	8(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0xaa, %ymm15, %ymm15  // beta
+
+	// seventh column
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 48(%r12) // dD[6]
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	addq	$128, %r11
+	vmovsd			80(%r11, %r14, 1), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 80(%r11, %r14, 1) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 48(%r12) // dD[6]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 208(%r13, %r15, 1) // pT[0+ps*0]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			64(%r11), %ymm0
+	vmovapd			64(%r11, %r14, 1), %ymm1
+	vmovapd			64(%r11, %r14, 2), %ymm2
+	vbroadcastsd	112(%r11, %r14, 1), %ymm10
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm10, %ymm2
+	vmovsd			%xmm10, 112(%r11, %r14, 1)
+	movq	ARG1, %r10 // n
+	subl	$8, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	16(%r11, %r14, 1), %ymm8
+	vbroadcastsd	48(%r11, %r14, 1), %ymm9
+	vbroadcastsd	80(%r11, %r14, 1), %ymm10
+	vbroadcastsd	112(%r11, %r14, 1), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm10, %ymm2
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm11, %ymm2
+	vmovsd			%xmm8, 16(%r11, %r14, 1)
+	vmovsd			%xmm9, 48(%r11, %r14, 1)
+	vmovsd			%xmm10, 80(%r11, %r14, 1)
+	vmovsd			%xmm11, 112(%r11, %r14, 1)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	16(%r11, %r14, 1), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vmovsd			%xmm8, 16(%r11, %r14, 1)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vxorpd			%xmm12, %xmm12, %xmm12
+	//
+//	vpermpd			$0x00, %ymm0, %ymm13
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm15
+	//
+	vpermpd			$0x55, %ymm0, %ymm13
+	vmovapd			32(%r13), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vmovapd			64(%r13), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0xff, %ymm0, %ymm13
+	vmovapd			96(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0x00, %ymm1, %ymm13 // vv
+	vmovapd			128(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			128(%r13, %r15, 1), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0x55, %ymm1, %ymm13 // vv
+	vmovapd			160(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			160(%r13, %r15, 1), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	//
+	vbroadcastsd	208(%r13, %r15, 1), %ymm14
+	vmulpd			%ymm14, %ymm15, %ymm15
+	vmulpd			%ymm14, %ymm11, %ymm11
+	vmulpd			%ymm14, %ymm1, %ymm1
+	vmulpd			%ymm14, %ymm2, %ymm2
+	vmovapd			192(%r13, %r15, 1), %ymm0
+	vblendpd		$0x3, %ymm11, %ymm0, %ymm11
+	vmovapd			%ymm15, 192(%r13)
+	vmovapd			%ymm11, 192(%r13, %r15, 1)
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vblendpd		$0x7, %ymm15, %ymm1, %ymm1
+
+	movq	ARG2, %r11 // D
+	//
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+	//
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vaddpd			%ymm1, %ymm9, %ymm9
+	vaddpd			%ymm2, %ymm10, %ymm10
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	112(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	16(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	48(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	80(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	112(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	//
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	16(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0xff, %ymm15, %ymm15  // beta
+
+	// eighth column
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 56(%r12) // dD[7]
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	addq	$128, %r11
+	vmovsd			120(%r11, %r14, 1), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 120(%r11, %r14, 1) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 56(%r12) // dD[7]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 248(%r13, %r15, 1) // pT[0+ps*0]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			96(%r11), %ymm0
+	vmovapd			96(%r11, %r14, 1), %ymm1
+	vmovapd			96(%r11, %r14, 2), %ymm2
+	movq	ARG1, %r10 // n
+	subl	$8, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	24(%r11, %r14, 1), %ymm8
+	vbroadcastsd	56(%r11, %r14, 1), %ymm9
+	vbroadcastsd	88(%r11, %r14, 1), %ymm10
+	vbroadcastsd	120(%r11, %r14, 1), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm10, %ymm2
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm11, %ymm2
+	vmovsd			%xmm8, 24(%r11, %r14, 1)
+	vmovsd			%xmm9, 56(%r11, %r14, 1)
+	vmovsd			%xmm10, 88(%r11, %r14, 1)
+	vmovsd			%xmm11, 120(%r11, %r14, 1)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	24(%r11, %r14, 1), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vmovsd			%xmm8, 24(%r11, %r14, 1)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vxorpd			%xmm12, %xmm12, %xmm12
+	//
+//	vpermpd			$0x00, %ymm0, %ymm13
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm15
+	//
+	vpermpd			$0x55, %ymm0, %ymm13
+	vmovapd			32(%r13), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vmovapd			64(%r13), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0xff, %ymm0, %ymm13
+	vmovapd			96(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0x00, %ymm1, %ymm13
+	vmovapd			128(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			128(%r13, %r15, 1), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0x55, %ymm1, %ymm13
+	vmovapd			160(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			160(%r13, %r15, 1), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0xaa, %ymm1, %ymm13
+	vmovapd			192(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			192(%r13, %r15, 1), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	//
+	vbroadcastsd	248(%r13, %r15, 1), %ymm14
+	vmulpd			%ymm14, %ymm15, %ymm15
+	vmulpd			%ymm14, %ymm11, %ymm11
+//	vmulpd			%ymm14, %ymm1, %ymm1
+	vmulpd			%ymm14, %ymm2, %ymm2
+	vmovapd			224(%r13, %r15, 1), %ymm0
+	vblendpd		$0x7, %ymm11, %ymm0, %ymm11
+	vmovapd			%ymm15, 224(%r13)
+	vmovapd			%ymm11, 224(%r13, %r15, 1)
+
+//	vxorpd			%ymm15, %ymm15, %ymm15
+//	vblendpd		$0xf, %ymm15, %ymm1, %ymm1
+
+	movq	ARG2, %r11 // D
+	//
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+	//
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vaddpd			%ymm2, %ymm10, %ymm10
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+	//
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	24(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	56(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmulpd			%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	88(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	120(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	24(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	56(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	88(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	120(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	//
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	24(%r11, %r14, 1), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+//	vpermpd	$0x00, %ymm15, %ymm15  // beta
+
+	// ninth column
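+	// note: from this column on the diagonal sits in the third 4-row panel: D is
+	// addressed with %r14 scaled by 2, T with %r15 scaled by 2, and the
+	// trailing-column counter starts from n-12.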
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 64(%r12) // dD[8]
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	addq	$256, %r11
+	vmovsd			0(%r11, %r14, 2), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 0(%r11, %r14, 2) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 64(%r12) // dD[8]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 256(%r13, %r15, 2) // pT[0+ps*0]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			0(%r11), %ymm0
+	vmovapd			0(%r11, %r14, 1), %ymm1
+	vmovapd			0(%r11, %r14, 2), %ymm2
+	vbroadcastsd	32(%r11, %r14, 2), %ymm8
+	vbroadcastsd	64(%r11, %r14, 2), %ymm9
+	vbroadcastsd	96(%r11, %r14, 2), %ymm10
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		32(%r11), %ymm8, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		64(%r11), %ymm9, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm10, %ymm2
+	vmovsd			%xmm8, 32(%r11, %r14, 2)
+	vmovsd			%xmm9, 64(%r11, %r14, 2)
+	vmovsd			%xmm10, 96(%r11, %r14, 2)
+	movq	ARG1, %r10 // n
+	subl	$12, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	0(%r11, %r14, 2), %ymm8
+	vbroadcastsd	32(%r11, %r14, 2), %ymm9
+	vbroadcastsd	64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	96(%r11, %r14, 2), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm10, %ymm2
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm11, %ymm2
+	vmovsd			%xmm8, 0(%r11, %r14, 2)
+	vmovsd			%xmm9, 32(%r11, %r14, 2)
+	vmovsd			%xmm10, 64(%r11, %r14, 2)
+	vmovsd			%xmm11, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	0(%r11, %r14, 2), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vmovsd			%xmm8, 0(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vxorpd			%xmm12, %xmm12, %xmm12
+	//
+//	vpermpd			$0x00, %ymm0, %ymm13
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm15
+	//
+	vpermpd			$0x55, %ymm0, %ymm13
+	vmovapd			32(%r13), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vmovapd			64(%r13), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0xff, %ymm0, %ymm13
+	vmovapd			96(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0x00, %ymm1, %ymm13
+	vmovapd			128(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			128(%r13, %r15, 1), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0x55, %ymm1, %ymm13
+	vmovapd			160(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			160(%r13, %r15, 1), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0xaa, %ymm1, %ymm13
+	vmovapd			192(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			192(%r13, %r15, 1), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0xff, %ymm1, %ymm13
+	vmovapd			224(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			224(%r13, %r15, 1), %ymm14
+//	vblendpd		$0xf, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	//
+	vbroadcastsd	256(%r13, %r15, 2), %ymm14
+	vmulpd			%ymm14, %ymm15, %ymm15
+	vmulpd			%ymm14, %ymm11, %ymm11
+//	vmulpd			%ymm14, %ymm1, %ymm1
+	vmulpd			%ymm14, %ymm2, %ymm2
+//	vmovapd			224(%r13, %r15, 1), %ymm0
+//	vblendpd		$0xf, %ymm11, %ymm0, %ymm11
+	vmovapd			%ymm15, 256(%r13)
+	vmovapd			%ymm11, 256(%r13, %r15, 1)
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vblendpd		$0x1, %ymm15, %ymm2, %ymm2
+
+	movq	ARG2, %r11 // D
+	//
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+	//
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+	//
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vaddpd			%ymm2, %ymm10, %ymm10
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	32(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	64(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmulpd			%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	96(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	0(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	32(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	64(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	96(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	//
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	0(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0x55, %ymm15, %ymm15  // beta
+
+	// tenth column
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 72(%r12) // dD[9]
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	addq	$256, %r11
+	vmovsd			40(%r11, %r14, 2), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 40(%r11, %r14, 2) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 72(%r12) // dD[9]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 296(%r13, %r15, 2) // pT[0+ps*0]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			32(%r11), %ymm0
+	vmovapd			32(%r11, %r14, 1), %ymm1
+	vmovapd			32(%r11, %r14, 2), %ymm2
+	vbroadcastsd	72(%r11, %r14, 2), %ymm9
+	vbroadcastsd	104(%r11, %r14, 2), %ymm10
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		64(%r11), %ymm9, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm10, %ymm2
+	vmovsd			%xmm9, 72(%r11, %r14, 2)
+	vmovsd			%xmm10, 104(%r11, %r14, 2)
+	movq	ARG1, %r10 // n
+	subl	$12, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	8(%r11, %r14, 2), %ymm8
+	vbroadcastsd	40(%r11, %r14, 2), %ymm9
+	vbroadcastsd	72(%r11, %r14, 2), %ymm10
+	vbroadcastsd	104(%r11, %r14, 2), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm10, %ymm2
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm11, %ymm2
+	vmovsd			%xmm8, 8(%r11, %r14, 2)
+	vmovsd			%xmm9, 40(%r11, %r14, 2)
+	vmovsd			%xmm10, 72(%r11, %r14, 2)
+	vmovsd			%xmm11, 104(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	8(%r11, %r14, 2), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vmovsd			%xmm8, 8(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vxorpd			%xmm12, %xmm12, %xmm12
+	//
+//	vpermpd			$0x00, %ymm0, %ymm13
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm15
+	//
+	vpermpd			$0x55, %ymm0, %ymm13
+	vmovapd			32(%r13), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vmovapd			64(%r13), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0xff, %ymm0, %ymm13
+	vmovapd			96(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0x00, %ymm1, %ymm13
+	vmovapd			128(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			128(%r13, %r15, 1), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0x55, %ymm1, %ymm13
+	vmovapd			160(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			160(%r13, %r15, 1), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0xaa, %ymm1, %ymm13
+	vmovapd			192(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			192(%r13, %r15, 1), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0xff, %ymm1, %ymm13
+	vmovapd			224(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			224(%r13, %r15, 1), %ymm14
+//	vblendpd		$0xf, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0x00, %ymm2, %ymm13
+	vmovapd			256(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			256(%r13, %r15, 1), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	vmovapd			256(%r13, %r15, 2), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm13, %ymm10
+	//
+	vbroadcastsd	296(%r13, %r15, 2), %ymm14
+	vmulpd			%ymm14, %ymm15, %ymm15
+	vmulpd			%ymm14, %ymm11, %ymm11
+	vmulpd			%ymm14, %ymm10, %ymm10
+	vmulpd			%ymm14, %ymm2, %ymm2
+	vmovapd			288(%r13, %r15, 2), %ymm0
+	vblendpd		$0x1, %ymm10, %ymm0, %ymm10
+	vmovapd			%ymm15, 288(%r13)
+	vmovapd			%ymm11, 288(%r13, %r15, 1)
+	vmovapd			%ymm10, 288(%r13, %r15, 2)
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vblendpd		$0x3, %ymm15, %ymm2, %ymm2
+
+	movq	ARG2, %r11 // D
+	//
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+	//
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+	//
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vaddpd			%ymm2, %ymm10, %ymm10
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	72(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	104(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmulpd			%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	8(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	40(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	72(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	104(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	//
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	8(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0xaa, %ymm15, %ymm15  // beta
+
+	// eleventh column
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 40(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	addq	$256, %r11
+	vmovsd			80(%r11, %r14, 2), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 80(%r11, %r14, 2) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 80(%r12) // dD[0]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 336(%r13, %r15, 2) // pT[0+ps*0]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			64(%r11), %ymm0
+	vmovapd			64(%r11, %r14, 1), %ymm1
+	vmovapd			64(%r11, %r14, 2), %ymm2
+	vbroadcastsd	112(%r11, %r14, 2), %ymm10
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm10, %ymm2
+	vmovsd			%xmm10, 112(%r11, %r14, 2)
+	movq	ARG1, %r10 // n
+	subl	$12, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	16(%r11, %r14, 2), %ymm8
+	vbroadcastsd	48(%r11, %r14, 2), %ymm9
+	vbroadcastsd	80(%r11, %r14, 2), %ymm10
+	vbroadcastsd	112(%r11, %r14, 2), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm10, %ymm2
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm11, %ymm2
+	vmovsd			%xmm8, 16(%r11, %r14, 2)
+	vmovsd			%xmm9, 48(%r11, %r14, 2)
+	vmovsd			%xmm10, 80(%r11, %r14, 2)
+	vmovsd			%xmm11, 112(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	16(%r11, %r14, 2), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vmovsd			%xmm8, 16(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vxorpd			%xmm12, %xmm12, %xmm12
+	//
+//	vpermpd			$0x00, %ymm0, %ymm13
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm15
+	//
+	vpermpd			$0x55, %ymm0, %ymm13
+	vmovapd			32(%r13), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vmovapd			64(%r13), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0xff, %ymm0, %ymm13
+	vmovapd			96(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0x00, %ymm1, %ymm13
+	vmovapd			128(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			128(%r13, %r15, 1), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0x55, %ymm1, %ymm13
+	vmovapd			160(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			160(%r13, %r15, 1), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0xaa, %ymm1, %ymm13
+	vmovapd			192(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			192(%r13, %r15, 1), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0xff, %ymm1, %ymm13
+	vmovapd			224(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			224(%r13, %r15, 1), %ymm14
+//	vblendpd		$0xf, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0x00, %ymm2, %ymm13
+	vmovapd			256(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			256(%r13, %r15, 1), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	vmovapd			256(%r13, %r15, 2), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm13, %ymm10
+	//
+	vpermpd			$0x55, %ymm2, %ymm13
+	vmovapd			288(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			288(%r13, %r15, 1), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	vmovapd			288(%r13, %r15, 2), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm10
+	//
+	vbroadcastsd	336(%r13, %r15, 2), %ymm14
+	vmulpd			%ymm14, %ymm15, %ymm15
+	vmulpd			%ymm14, %ymm11, %ymm11
+	vmulpd			%ymm14, %ymm10, %ymm10
+	vmulpd			%ymm14, %ymm2, %ymm2
+	vmovapd			320(%r13, %r15, 2), %ymm0
+	vblendpd		$0x3, %ymm10, %ymm0, %ymm10
+	vmovapd			%ymm15, 320(%r13)
+	vmovapd			%ymm11, 320(%r13, %r15, 1)
+	vmovapd			%ymm10, 320(%r13, %r15, 2)
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vblendpd		$0x7, %ymm15, %ymm2, %ymm2
+
+	movq	ARG2, %r11 // D
+	//
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+	//
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+	//
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vaddpd			%ymm2, %ymm10, %ymm10
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	112(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	16(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	48(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	80(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	112(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	//
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	16(%r11, %r14, 2), %ymm14
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0xff, %ymm15, %ymm15  // beta
+
+	// twelfth column
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 40(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	addq	$256, %r11
+	vmovsd			120(%r11, %r14, 2), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 120(%r11, %r14, 2) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 88(%r12) // dD[0]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 376(%r13, %r15, 2) // pT[0+ps*0]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			96(%r11), %ymm0
+	vmovapd			96(%r11, %r14, 1), %ymm1
+	vmovapd			96(%r11, %r14, 2), %ymm2
+	movq	ARG1, %r10 // n
+	subl	$12, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	24(%r11, %r14, 2), %ymm8
+	vbroadcastsd	56(%r11, %r14, 2), %ymm9
+	vbroadcastsd	88(%r11, %r14, 2), %ymm10
+	vbroadcastsd	120(%r11, %r14, 2), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm10, %ymm2
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm11, %ymm2
+	vmovsd			%xmm8, 24(%r11, %r14, 2)
+	vmovsd			%xmm9, 56(%r11, %r14, 2)
+	vmovsd			%xmm10, 88(%r11, %r14, 2)
+	vmovsd			%xmm11, 120(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	24(%r11, %r14, 2), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vmovsd			%xmm8, 24(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vxorpd			%xmm12, %xmm12, %xmm12
+	//
+//	vpermpd			$0x00, %ymm0, %ymm13
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm15
+	//
+	vpermpd			$0x55, %ymm0, %ymm13
+	vmovapd			32(%r13), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vmovapd			64(%r13), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0xff, %ymm0, %ymm13
+	vmovapd			96(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	//
+	vpermpd			$0x00, %ymm1, %ymm13
+	vmovapd			128(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			128(%r13, %r15, 1), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0x55, %ymm1, %ymm13
+	vmovapd			160(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			160(%r13, %r15, 1), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0xaa, %ymm1, %ymm13
+	vmovapd			192(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			192(%r13, %r15, 1), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0xff, %ymm1, %ymm13
+	vmovapd			224(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			224(%r13, %r15, 1), %ymm14
+//	vblendpd		$0xf, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	//
+	vpermpd			$0x00, %ymm2, %ymm13
+	vmovapd			256(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			256(%r13, %r15, 1), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	vmovapd			256(%r13, %r15, 2), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm13, %ymm10
+	//
+	vpermpd			$0x55, %ymm2, %ymm13
+	vmovapd			288(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			288(%r13, %r15, 1), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	vmovapd			288(%r13, %r15, 2), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm10
+	//
+	vpermpd			$0xaa, %ymm2, %ymm13
+	vmovapd			320(%r13), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vmovapd			320(%r13, %r15, 1), %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm11
+	vmovapd			320(%r13, %r15, 2), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vfmadd231pd		%ymm14, %ymm13, %ymm10
+	//
+	vbroadcastsd	376(%r13, %r15, 2), %ymm14
+	vmulpd			%ymm14, %ymm15, %ymm15
+	vmulpd			%ymm14, %ymm11, %ymm11
+	vmulpd			%ymm14, %ymm10, %ymm10
+//	vmulpd			%ymm14, %ymm2, %ymm2
+	vmovapd			352(%r13, %r15, 2), %ymm0
+	vblendpd		$0x7, %ymm10, %ymm0, %ymm10
+	vmovapd			%ymm15, 352(%r13)
+	vmovapd			%ymm11, 352(%r13, %r15, 1)
+	vmovapd			%ymm10, 352(%r13, %r15, 2)
+
+102:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgelqf_dlarft12_12_lib4, .-kernel_dgelqf_dlarft12_12_lib4
+#endif
+
+
+
+
+
+//                                   1      2           3        4           5
+// void kernel_dgelqf_dlarft4_12_lib4(int n, double *pD, int sdd, double *dD, double *pT)
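+//
+// A minimal, hypothetical C call sketch (not part of the original source): it
+// assumes the lib4 panel-major layout, where pD points to a 12 x n panel made
+// of three 4-row sub-panels with a byte stride of 32*sdd between them (as
+// implied by the "sall $5, %r14d" below); the kernel appears to factor the
+// first 4 rows in place, writing the scaling factors to dD and a 4x4 T factor
+// to pT:
+//
+//   kernel_dgelqf_dlarft4_12_lib4(n, pD, sdd, dD, pT);
+//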
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgelqf_dlarft4_12_lib4
+	.type kernel_dgelqf_dlarft4_12_lib4, @function
+kernel_dgelqf_dlarft4_12_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgelqf_dlarft4_12_lib4
+_kernel_dgelqf_dlarft4_12_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgelqf_dlarft4_12_lib4
+	.def kernel_dgelqf_dlarft4_12_lib4; .scl 2; .type 32; .endef
+kernel_dgelqf_dlarft4_12_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero T
+
+	movq	ARG5, %r10 // T
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			%ymm15, 0(%r10)
+	vmovapd			%ymm15, 32(%r10)
+	vmovapd			%ymm15, 64(%r10)
+	vmovapd			%ymm15, 96(%r10)
+
+	// first column
+
+	movq	ARG2, %r11 // D
+	movq	ARG3, %r14 // sdd
+	sall	$5, %r14d
+	movq	ARG4, %r12 // dD
+	movq	ARG5, %r13 // T
+
+	vxorpd			%xmm15, %xmm15, %xmm15
+	movq	ARG1, %r10 // n
+	subl	$1, %r10d
+	addq	$32, %r11
+100:
+	vmovsd			0(%r11), %xmm14
+	vfmadd231sd		%xmm14, %xmm14, %xmm15
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		100b
+
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		101f
+	vmovsd			%xmm14, 0(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	vmovsd			0(%r11), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 0(%r11) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 0(%r12) // dD[0]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 0(%r13) // pT[0+ps*0]
+
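+	// The block above appears to compute the Householder reflector scalars in
+	// the usual dlarfg style (assumption: .LC00 holds a sign-bit mask and the
+	// upper lane of .LC01 holds 1.0, both defined elsewhere in this file):
+	//   beta  = -sign(alpha) * sqrt(alpha^2 + sum x_i^2)   -> stored in pD[0]
+	//   dD[0] = (beta - alpha) / beta
+	//   pT[0] = -(beta - alpha) / beta
+	// The broadcast below then holds 1/(beta - alpha), used to scale the
+	// trailing entries of the row so that the leading entry of the reflector
+	// is implicitly 1.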
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			0(%r11), %ymm0
+	vmovapd			0(%r11, %r14, 1), %ymm1
+	vmovapd			0(%r11, %r14, 2), %ymm2
+	vbroadcastsd	32(%r11), %ymm8
+	vbroadcastsd	64(%r11), %ymm9
+	vbroadcastsd	96(%r11), %ymm10
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		32(%r11), %ymm8, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		64(%r11), %ymm9, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm10, %ymm2
+	vmovsd			%xmm8, 32(%r11)
+	vmovsd			%xmm9, 64(%r11)
+	vmovsd			%xmm10, 96(%r11)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	0(%r11), %ymm8
+	vbroadcastsd	32(%r11), %ymm9
+	vbroadcastsd	64(%r11), %ymm10
+	vbroadcastsd	96(%r11), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm10, %ymm2
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm11, %ymm2
+	vmovsd			%xmm8, 0(%r11)
+	vmovsd			%xmm9, 32(%r11)
+	vmovsd			%xmm10, 64(%r11)
+	vmovsd			%xmm11, 96(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	0(%r11), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vmovsd			%xmm8, 0(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vbroadcastsd	0(%r13), %ymm15
+	vmulpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm15, %ymm2, %ymm2
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vblendpd		$0x1, %ymm15, %ymm0, %ymm0
+
+	movq	ARG2, %r11 // D
+	//
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vaddpd			%ymm0, %ymm8, %ymm8
+	vaddpd			%ymm1, %ymm9, %ymm9
+	vaddpd			%ymm2, %ymm10, %ymm10
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	32(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmovapd			%ymm8, 32(%r11)
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	64(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmulpd			%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	96(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	0(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	32(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 32(%r11)
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	64(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	96(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	//
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	0(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0x55, %ymm15, %ymm15  // beta
+
+	// second column
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 8(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	vmovsd			40(%r11), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 40(%r11) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 8(%r12) // dD[0]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 40(%r13) // pT[0+ps*0]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			32(%r11), %ymm0
+	vmovapd			32(%r11, %r14, 1), %ymm1
+	vmovapd			32(%r11, %r14, 2), %ymm2
+	vbroadcastsd	72(%r11), %ymm9
+	vbroadcastsd	104(%r11), %ymm10
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		64(%r11), %ymm9, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm10, %ymm2
+	vmovsd			%xmm9, 72(%r11)
+	vmovsd			%xmm10, 104(%r11)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	8(%r11), %ymm8
+	vbroadcastsd	40(%r11), %ymm9
+	vbroadcastsd	72(%r11), %ymm10
+	vbroadcastsd	104(%r11), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm10, %ymm2
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm11, %ymm2
+	vmovsd			%xmm8, 8(%r11)
+	vmovsd			%xmm9, 40(%r11)
+	vmovsd			%xmm10, 72(%r11)
+	vmovsd			%xmm11, 104(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	8(%r11), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vmovsd			%xmm8, 8(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC02(%rip), %ymm12
+#else
+	vmovapd			LC02(%rip), %ymm12
+#endif
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm0
+	vbroadcastsd	40(%r13), %ymm15
+	vmulpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm15, %ymm1, %ymm1
+	vmulpd			%ymm15, %ymm2, %ymm2
+	vmovsd			%xmm0, 32(%r13)
+
+	vxorpd			%ymm12, %ymm12, %ymm12
+	vblendpd		$0x3, %ymm12, %ymm0, %ymm0
+
+	movq	ARG2, %r11 // D
+	//
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vaddpd			%ymm0, %ymm8, %ymm8
+	vaddpd			%ymm1, %ymm9, %ymm9
+	vaddpd			%ymm2, %ymm10, %ymm10
+	vmovapd			%ymm8, 32(%r11)
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	72(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	104(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmulpd			%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	8(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	40(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 32(%r11)
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	72(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	104(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	8(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0xaa, %ymm15, %ymm15  // beta
+
+	// third column
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 16(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	vmovsd			80(%r11), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 80(%r11) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 16(%r12) // dD[0]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 80(%r13) // pT[0+ps*0]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			64(%r11), %ymm0
+	vmovapd			64(%r11, %r14, 1), %ymm1
+	vmovapd			64(%r11, %r14, 2), %ymm2
+	vbroadcastsd	112(%r11), %ymm10
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm10, %ymm2
+	vmovsd			%xmm10, 112(%r11)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	16(%r11), %ymm8
+	vbroadcastsd	48(%r11), %ymm9
+	vbroadcastsd	80(%r11), %ymm10
+	vbroadcastsd	112(%r11), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm10, %ymm2
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm11, %ymm2
+	vmovsd			%xmm8, 16(%r11)
+	vmovsd			%xmm9, 48(%r11)
+	vmovsd			%xmm10, 80(%r11)
+	vmovsd			%xmm11, 112(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	16(%r11), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vmovsd			%xmm8, 16(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vxorpd			%xmm12, %xmm12, %xmm12
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm15
+	vmovapd			32(%r13), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vblendpd		$0x7, %ymm15, %ymm0, %ymm0
+	vbroadcastsd	80(%r13), %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm0
+	vmulpd			%ymm14, %ymm1, %ymm1
+	vmulpd			%ymm14, %ymm2, %ymm2
+	vmovapd			%xmm0, 64(%r13)
+
+	vxorpd			%ymm12, %ymm12, %ymm12
+	vblendpd		$0x7, %ymm12, %ymm0, %ymm0
+
+	movq	ARG2, %r11 // D
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vaddpd			%ymm0, %ymm8, %ymm8
+	vaddpd			%ymm1, %ymm9, %ymm9
+	vaddpd			%ymm2, %ymm10, %ymm10
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	112(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	16(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	48(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 32(%r11)
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	80(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	112(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	16(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0xff, %ymm15, %ymm15  // beta
+
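+	// fourth column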
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 24(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	vmovsd			120(%r11), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 120(%r11) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 24(%r12) // dD[0]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 120(%r13) // pT[0+ps*0]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			96(%r11), %ymm0
+	vmovapd			96(%r11, %r14, 1), %ymm1
+	vmovapd			96(%r11, %r14, 2), %ymm2
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	24(%r11), %ymm8
+	vbroadcastsd	56(%r11), %ymm9
+	vbroadcastsd	88(%r11), %ymm10
+	vbroadcastsd	120(%r11), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		32(%r11, %r14, 2), %ymm9, %ymm2
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		64(%r11, %r14, 2), %ymm10, %ymm2
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vfmadd231pd		96(%r11, %r14, 2), %ymm11, %ymm2
+	vmovsd			%xmm8, 24(%r11)
+	vmovsd			%xmm9, 56(%r11)
+	vmovsd			%xmm10, 88(%r11)
+	vmovsd			%xmm11, 120(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	24(%r11), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		0(%r11, %r14, 2), %ymm8, %ymm2
+	vmovsd			%xmm8, 24(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vxorpd			%xmm12, %xmm12, %xmm12
+
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm15
+
+	vmovapd			32(%r13), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+
+	vmovapd			64(%r13), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+
+	vbroadcastsd	120(%r13), %ymm14
+	vmulpd			%ymm14, %ymm15, %ymm15
+	vmulpd			%ymm14, %ymm1, %ymm1
+	vmulpd			%ymm14, %ymm2, %ymm2
+	vmovapd			96(%r13), %ymm0
+	vblendpd		$0x7, %ymm15, %ymm0, %ymm0
+	vmovapd			%ymm0, 96(%r13)
+
+	movq	ARG2, %r11 // D
+	//
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vaddpd			%ymm1, %ymm9, %ymm9
+	vaddpd			%ymm2, %ymm10, %ymm10
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	24(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	//
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vmovapd			32(%r11, %r14, 2), %ymm10
+	vbroadcastsd	56(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	vmovapd			%ymm10, 32(%r11, %r14, 2)
+	//
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vmovapd			64(%r11, %r14, 2), %ymm10
+	vbroadcastsd	88(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	vmovapd			%ymm10, 64(%r11, %r14, 2)
+	//
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vmovapd			96(%r11, %r14, 2), %ymm10
+	vbroadcastsd	120(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	vmovapd			%ymm10, 96(%r11, %r14, 2)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vmovapd			0(%r11, %r14, 2), %ymm10
+	vbroadcastsd	24(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm2, %ymm14, %ymm10
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	vmovapd			%ymm10, 0(%r11, %r14, 2)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+
+102:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgelqf_dlarft4_12_lib4, .-kernel_dgelqf_dlarft4_12_lib4
+#endif
+
+
+
+
+
+//                                  1      2           3        4           5
+// void kernel_dgelqf_dlarft4_8_lib4(int n, double *pD, int sdd, double *dD, double *pT)
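+//
+// Same interface as kernel_dgelqf_dlarft4_12_lib4 above, but D appears to span
+// only two 4-row sub-panels (an 8 x n panel); see the hypothetical call sketch
+// given there.
+//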
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgelqf_dlarft4_8_lib4
+	.type kernel_dgelqf_dlarft4_8_lib4, @function
+kernel_dgelqf_dlarft4_8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgelqf_dlarft4_8_lib4
+_kernel_dgelqf_dlarft4_8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgelqf_dlarft4_8_lib4
+	.def kernel_dgelqf_dlarft4_8_lib4; .scl 2; .type 32; .endef
+kernel_dgelqf_dlarft4_8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero T
+
+	movq	ARG5, %r10 // T
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			%ymm15, 0(%r10)
+	vmovapd			%ymm15, 32(%r10)
+	vmovapd			%ymm15, 64(%r10)
+	vmovapd			%ymm15, 96(%r10)
+
+	// first column
+
+	movq	ARG2, %r11 // D
+	movq	ARG3, %r14 // sdd
+	sall	$5, %r14d
+	movq	ARG4, %r12 // dD
+	movq	ARG5, %r13 // T
+
+	vxorpd			%xmm15, %xmm15, %xmm15
+	movq	ARG1, %r10 // n
+	subl	$1, %r10d
+	addq	$32, %r11
+100:
+	vmovsd			0(%r11), %xmm14
+	vfmadd231sd		%xmm14, %xmm14, %xmm15
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		100b
+
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		101f
+	vmovsd			%xmm14, 0(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	vmovsd			0(%r11), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 0(%r11) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 0(%r12) // dD[0]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 0(%r13) // pT[0+ps*0]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			0(%r11), %ymm0
+	vmovapd			0(%r11, %r14, 1), %ymm1
+	vbroadcastsd	32(%r11), %ymm8
+	vbroadcastsd	64(%r11), %ymm9
+	vbroadcastsd	96(%r11), %ymm10
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		32(%r11), %ymm8, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		64(%r11), %ymm9, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm10, %ymm1
+	vmovsd			%xmm8, 32(%r11)
+	vmovsd			%xmm9, 64(%r11)
+	vmovsd			%xmm10, 96(%r11)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	0(%r11), %ymm8
+	vbroadcastsd	32(%r11), %ymm9
+	vbroadcastsd	64(%r11), %ymm10
+	vbroadcastsd	96(%r11), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vmovsd			%xmm8, 0(%r11)
+	vmovsd			%xmm9, 32(%r11)
+	vmovsd			%xmm10, 64(%r11)
+	vmovsd			%xmm11, 96(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	0(%r11), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vmovsd			%xmm8, 0(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vbroadcastsd	0(%r13), %ymm15
+	vmulpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm15, %ymm1, %ymm1
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vblendpd		$0x1, %ymm15, %ymm0, %ymm0
+
+	movq	ARG2, %r11 // D
+	//
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vaddpd			%ymm0, %ymm8, %ymm8
+	vaddpd			%ymm1, %ymm9, %ymm9
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	//
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vbroadcastsd	32(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vmovapd			%ymm8, 32(%r11)
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vbroadcastsd	64(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vmulpd			%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vbroadcastsd	96(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vbroadcastsd	0(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	//
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vbroadcastsd	32(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 32(%r11)
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vbroadcastsd	64(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vbroadcastsd	96(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	//
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vbroadcastsd	0(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0x55, %ymm15, %ymm15  // beta
+
+	// second column
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 8(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	vmovsd			40(%r11), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 40(%r11) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 8(%r12) // dD[0]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 40(%r13) // pT[0+ps*0]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			32(%r11), %ymm0
+	vmovapd			32(%r11, %r14, 1), %ymm1
+	vbroadcastsd	72(%r11), %ymm9
+	vbroadcastsd	104(%r11), %ymm10
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		64(%r11), %ymm9, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm10, %ymm1
+	vmovsd			%xmm9, 72(%r11)
+	vmovsd			%xmm10, 104(%r11)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	8(%r11), %ymm8
+	vbroadcastsd	40(%r11), %ymm9
+	vbroadcastsd	72(%r11), %ymm10
+	vbroadcastsd	104(%r11), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vmovsd			%xmm8, 8(%r11)
+	vmovsd			%xmm9, 40(%r11)
+	vmovsd			%xmm10, 72(%r11)
+	vmovsd			%xmm11, 104(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	8(%r11), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vmovsd			%xmm8, 8(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC02(%rip), %ymm12
+#else
+	vmovapd			LC02(%rip), %ymm12
+#endif
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm0
+	vbroadcastsd	40(%r13), %ymm15
+	vmulpd			%ymm15, %ymm0, %ymm0
+	vmulpd			%ymm15, %ymm1, %ymm1
+	vmovsd			%xmm0, 32(%r13)
+
+	vxorpd			%ymm12, %ymm12, %ymm12
+	vblendpd		$0x3, %ymm12, %ymm0, %ymm0
+
+	movq	ARG2, %r11 // D
+	//
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vaddpd			%ymm0, %ymm8, %ymm8
+	vaddpd			%ymm1, %ymm9, %ymm9
+	vmovapd			%ymm8, 32(%r11)
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vbroadcastsd	72(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vbroadcastsd	104(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vmulpd			%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vbroadcastsd	8(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	//
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vbroadcastsd	40(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 32(%r11)
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vbroadcastsd	72(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vbroadcastsd	104(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vbroadcastsd	8(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0xaa, %ymm15, %ymm15  // beta
+
+	// third column
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 16(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	vmovsd			80(%r11), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 80(%r11) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 16(%r12) // dD[0]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 80(%r13) // pT[0+ps*0]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			64(%r11), %ymm0
+	vmovapd			64(%r11, %r14, 1), %ymm1
+	vbroadcastsd	112(%r11), %ymm10
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm10, %ymm1
+	vmovsd			%xmm10, 112(%r11)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	16(%r11), %ymm8
+	vbroadcastsd	48(%r11), %ymm9
+	vbroadcastsd	80(%r11), %ymm10
+	vbroadcastsd	112(%r11), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vmovsd			%xmm8, 16(%r11)
+	vmovsd			%xmm9, 48(%r11)
+	vmovsd			%xmm10, 80(%r11)
+	vmovsd			%xmm11, 112(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	16(%r11), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vmovsd			%xmm8, 16(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vxorpd			%xmm12, %xmm12, %xmm12
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm15
+	vmovapd			32(%r13), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+	vblendpd		$0x7, %ymm15, %ymm0, %ymm0
+	vbroadcastsd	80(%r13), %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm0
+	vmulpd			%ymm14, %ymm1, %ymm1
+	vmovapd			%xmm0, 64(%r13)
+
+	vxorpd			%ymm12, %ymm12, %ymm12
+	vblendpd		$0x7, %ymm12, %ymm0, %ymm0
+
+	movq	ARG2, %r11 // D
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vaddpd			%ymm0, %ymm8, %ymm8
+	vaddpd			%ymm1, %ymm9, %ymm9
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vbroadcastsd	112(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vbroadcastsd	16(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	//
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vbroadcastsd	48(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 32(%r11)
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	//
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vbroadcastsd	80(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 64(%r11)
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	//
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vbroadcastsd	112(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 96(%r11)
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vbroadcastsd	16(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0xff, %ymm15, %ymm15  // beta
+
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 24(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	vmovsd			120(%r11), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 120(%r11) // pD[3+ps*3]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 24(%r12) // dD[3]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 120(%r13) // pT[3+ps*3]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			96(%r11), %ymm0
+	vmovapd			96(%r11, %r14, 1), %ymm1
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	24(%r11), %ymm8
+	vbroadcastsd	56(%r11), %ymm9
+	vbroadcastsd	88(%r11), %ymm10
+	vbroadcastsd	120(%r11), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		32(%r11, %r14, 1), %ymm9, %ymm1
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		64(%r11, %r14, 1), %ymm10, %ymm1
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vfmadd231pd		96(%r11, %r14, 1), %ymm11, %ymm1
+	vmovsd			%xmm8, 24(%r11)
+	vmovsd			%xmm9, 56(%r11)
+	vmovsd			%xmm10, 88(%r11)
+	vmovsd			%xmm11, 120(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	24(%r11), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		0(%r11, %r14, 1), %ymm8, %ymm1
+	vmovsd			%xmm8, 24(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vxorpd			%xmm12, %xmm12, %xmm12
+
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm15
+
+	vmovapd			32(%r13), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+
+	vmovapd			64(%r13), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm15
+
+	vbroadcastsd	120(%r13), %ymm14
+	vmulpd			%ymm14, %ymm15, %ymm15
+	vmulpd			%ymm14, %ymm1, %ymm1
+	vmovapd			96(%r13), %ymm0
+	vblendpd		$0x7, %ymm15, %ymm0, %ymm0
+	vmovapd			%ymm0, 96(%r13)
+
+	movq	ARG2, %r11 // D
+	//
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vaddpd			%ymm1, %ymm9, %ymm9
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	//
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vbroadcastsd	24(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	//
+	vmovapd			32(%r11, %r14, 1), %ymm9
+	vbroadcastsd	56(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vmovapd			%ymm9, 32(%r11, %r14, 1)
+	//
+	vmovapd			64(%r11, %r14, 1), %ymm9
+	vbroadcastsd	88(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vmovapd			%ymm9, 64(%r11, %r14, 1)
+	//
+	vmovapd			96(%r11, %r14, 1), %ymm9
+	vbroadcastsd	120(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vmovapd			%ymm9, 96(%r11, %r14, 1)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	vmovapd			0(%r11, %r14, 1), %ymm9
+	vbroadcastsd	24(%r11), %ymm14
+	vfmadd231pd		%ymm1, %ymm14, %ymm9
+	vmovapd			%ymm9, 0(%r11, %r14, 1)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+
+102:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgelqf_dlarft4_8_lib4, .-kernel_dgelqf_dlarft4_8_lib4
+#endif
+
+
+
+
+
+//                                  1      2           3           4
+// void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT)
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgelqf_dlarft4_4_lib4
+	.type kernel_dgelqf_dlarft4_4_lib4, @function
+kernel_dgelqf_dlarft4_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgelqf_dlarft4_4_lib4
+_kernel_dgelqf_dlarft4_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgelqf_dlarft4_4_lib4
+	.def kernel_dgelqf_dlarft4_4_lib4; .scl 2; .type 32; .endef
+kernel_dgelqf_dlarft4_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero T
+
+	movq	ARG4, %r10 // T
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			%ymm15, 0(%r10)
+	vmovapd			%ymm15, 32(%r10)
+	vmovapd			%ymm15, 64(%r10)
+	vmovapd			%ymm15, 96(%r10)
+
+	// first column
+
+	movq	ARG2, %r11 // D
+	movq	ARG3, %r12 // dD
+	movq	ARG4, %r13 // T
+
+	vxorpd			%xmm15, %xmm15, %xmm15
+	movq	ARG1, %r10 // n
+	subl	$1, %r10d
+	addq	$32, %r11
+100:
+	vmovsd			0(%r11), %xmm14
+	vfmadd231sd		%xmm14, %xmm14, %xmm15
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		100b
+
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		101f
+	vmovsd			%xmm14, 0(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	vmovsd			0(%r11), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 0(%r11) // pD[0+ps*0]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 0(%r12) // dD[0]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 0(%r13) // pT[0+ps*0]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			0(%r11), %ymm0
+	vbroadcastsd	32(%r11), %ymm8
+	vbroadcastsd	64(%r11), %ymm9
+	vbroadcastsd	96(%r11), %ymm10
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		32(%r11), %ymm8, %ymm0
+	vfmadd231pd		64(%r11), %ymm9, %ymm0
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vmovsd			%xmm8, 32(%r11)
+	vmovsd			%xmm9, 64(%r11)
+	vmovsd			%xmm10, 96(%r11)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	0(%r11), %ymm8
+	vbroadcastsd	32(%r11), %ymm9
+	vbroadcastsd	64(%r11), %ymm10
+	vbroadcastsd	96(%r11), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vmovsd			%xmm8, 0(%r11)
+	vmovsd			%xmm9, 32(%r11)
+	vmovsd			%xmm10, 64(%r11)
+	vmovsd			%xmm11, 96(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	0(%r11), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vmovsd			%xmm8, 0(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vbroadcastsd	0(%r13), %ymm15
+	vmulpd			%ymm15, %ymm0, %ymm0
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vblendpd		$0x1, %ymm15, %ymm0, %ymm0
+
+	movq	ARG2, %r11 // D
+	vmovapd			0(%r11), %ymm8
+	vmovapd			32(%r11), %ymm9
+	vmovapd			64(%r11), %ymm10
+	vmovapd			96(%r11), %ymm11
+	vaddpd			%ymm0, %ymm8, %ymm8
+	vbroadcastsd	32(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm9
+	vbroadcastsd	64(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm10
+	vbroadcastsd	96(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm11
+	vmulpd			%ymm10, %ymm10, %ymm15
+	vfmadd231pd		%ymm11, %ymm11, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 32(%r11)
+	vmovapd			%ymm10, 64(%r11)
+	vmovapd			%ymm11, 96(%r11)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	vmovapd			0(%r11), %ymm8
+	vmovapd			32(%r11), %ymm9
+	vmovapd			64(%r11), %ymm10
+	vmovapd			96(%r11), %ymm11
+	vbroadcastsd	0(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vbroadcastsd	32(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm9
+	vbroadcastsd	64(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm10
+	vbroadcastsd	96(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm11
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vfmadd231pd		%ymm11, %ymm11, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 32(%r11)
+	vmovapd			%ymm10, 64(%r11)
+	vmovapd			%ymm11, 96(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0x55, %ymm15, %ymm15  // beta
+
+	// second column
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 8(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	vmovsd			40(%r11), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 40(%r11) // pD[1+ps*1]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 8(%r12) // dD[1]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 40(%r13) // pT[1+ps*1]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			32(%r11), %ymm0
+	vbroadcastsd	72(%r11), %ymm9
+	vbroadcastsd	104(%r11), %ymm10
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		64(%r11), %ymm9, %ymm0
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vmovsd			%xmm9, 72(%r11)
+	vmovsd			%xmm10, 104(%r11)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	8(%r11), %ymm8
+	vbroadcastsd	40(%r11), %ymm9
+	vbroadcastsd	72(%r11), %ymm10
+	vbroadcastsd	104(%r11), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vmovsd			%xmm8, 8(%r11)
+	vmovsd			%xmm9, 40(%r11)
+	vmovsd			%xmm10, 72(%r11)
+	vmovsd			%xmm11, 104(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	8(%r11), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vmovsd			%xmm8, 8(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC02(%rip), %ymm12
+#else
+	vmovapd			LC02(%rip), %ymm12
+#endif
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm0
+	vbroadcastsd	40(%r13), %ymm15
+	vmulpd			%ymm15, %ymm0, %ymm0
+	vmovsd			%xmm0, 32(%r13)
+
+	vxorpd			%ymm12, %ymm12, %ymm12
+	vblendpd		$0x3, %ymm12, %ymm0, %ymm0
+
+	movq	ARG2, %r11 // D
+	vmovapd			32(%r11), %ymm9
+	vmovapd			64(%r11), %ymm10
+	vmovapd			96(%r11), %ymm11
+	vaddpd			%ymm0, %ymm9, %ymm9
+	vbroadcastsd	72(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm10
+	vbroadcastsd	104(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm11
+	vmulpd			%ymm11, %ymm11, %ymm15
+	vmovapd			%ymm9, 32(%r11)
+	vmovapd			%ymm10, 64(%r11)
+	vmovapd			%ymm11, 96(%r11)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	vmovapd			0(%r11), %ymm8
+	vmovapd			32(%r11), %ymm9
+	vmovapd			64(%r11), %ymm10
+	vmovapd			96(%r11), %ymm11
+	vbroadcastsd	8(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vbroadcastsd	40(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm9
+	vbroadcastsd	72(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm10
+	vbroadcastsd	104(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm11
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vfmadd231pd		%ymm11, %ymm11, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 32(%r11)
+	vmovapd			%ymm10, 64(%r11)
+	vmovapd			%ymm11, 96(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	8(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0xaa, %ymm15, %ymm15  // beta
+
+	// third column
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 16(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	vmovsd			80(%r11), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 80(%r11) // pD[2+ps*2]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 16(%r12) // dD[2]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 80(%r13) // pT[2+ps*2]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			64(%r11), %ymm0
+	vbroadcastsd	112(%r11), %ymm10
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vfmadd231pd		96(%r11), %ymm10, %ymm0
+	vmovsd			%xmm10, 112(%r11)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	16(%r11), %ymm8
+	vbroadcastsd	48(%r11), %ymm9
+	vbroadcastsd	80(%r11), %ymm10
+	vbroadcastsd	112(%r11), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vmovsd			%xmm8, 16(%r11)
+	vmovsd			%xmm9, 48(%r11)
+	vmovsd			%xmm10, 80(%r11)
+	vmovsd			%xmm11, 112(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	16(%r11), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vmovsd			%xmm8, 16(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vxorpd			%xmm12, %xmm12, %xmm12
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm1
+	vmovapd			32(%r13), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm1
+	vblendpd		$0x7, %ymm1, %ymm0, %ymm0
+	vbroadcastsd	80(%r13), %ymm15
+	vmulpd			%ymm15, %ymm0, %ymm0
+	vmovapd			%xmm0, 64(%r13)
+
+	vxorpd			%ymm12, %ymm12, %ymm12
+	vblendpd		$0x7, %ymm12, %ymm0, %ymm0
+
+	movq	ARG2, %r11 // D
+	vmovapd			64(%r11), %ymm10
+	vmovapd			96(%r11), %ymm11
+	vaddpd			%ymm0, %ymm10, %ymm10
+	vbroadcastsd	112(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm11
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			%ymm10, 64(%r11)
+	vmovapd			%ymm11, 96(%r11)
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		110f
+106:
+	vmovapd			0(%r11), %ymm8
+	vmovapd			32(%r11), %ymm9
+	vmovapd			64(%r11), %ymm10
+	vmovapd			96(%r11), %ymm11
+	vbroadcastsd	16(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vbroadcastsd	48(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm9
+	vbroadcastsd	80(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm10
+	vbroadcastsd	112(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm11
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vfmadd231pd		%ymm9, %ymm9, %ymm15
+	vfmadd231pd		%ymm10, %ymm10, %ymm15
+	vfmadd231pd		%ymm11, %ymm11, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	vmovapd			%ymm9, 32(%r11)
+	vmovapd			%ymm10, 64(%r11)
+	vmovapd			%ymm11, 96(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		106b
+110:
+	cmpl	$0, %r10d
+	jle		107f
+108:
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	16(%r11), %ymm14
+	vfmadd231pd		%ymm0, %ymm14, %ymm8
+	vfmadd231pd		%ymm8, %ymm8, %ymm15
+	vmovapd			%ymm8, 0(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		108b
+107:
+	vpermpd	$0xff, %ymm15, %ymm15  // beta
+
+102:
+	vxorpd			%xmm14, %xmm14, %xmm14
+	vucomisd		%xmm14, %xmm15
+	jne		101f
+//	jp		111f
+	vmovsd			%xmm14, 24(%r12)
+	jmp		102f
+
+101:
+	movq	ARG2, %r11 // D
+	vmovsd			120(%r11), %xmm14 // alpha
+	vfmadd231sd		%xmm14, %xmm14, %xmm15 // beta
+	vsqrtsd			%xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC00(%rip), %xmm13 // mask
+#else
+	vmovsd			LC00(%rip), %xmm13 // mask
+#endif
+	vandpd			%xmm13, %xmm14, %xmm12
+	vxorpd			%xmm13, %xmm12, %xmm12
+	vxorpd			%xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 120(%r11) // pD[3+ps*3]
+	vsubsd			%xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd			.LC01(%rip), %xmm12
+#else
+	vmovapd			LC01(%rip), %xmm12
+#endif
+	vmovsd			%xmm14, %xmm12, %xmm12
+	vmovddup		%xmm14, %xmm14
+	vmovsd			%xmm15, %xmm14, %xmm14
+	vdivpd			%xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 24(%r12) // dD[3]
+	vxorpd			%xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 120(%r13) // pT[3+ps*3]
+
+	vpermpd			$0x55, %ymm14, %ymm15 // tmp
+
+	vmovapd			96(%r11), %ymm0
+	movq	ARG1, %r10 // n
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jle		109f
+103:
+	vbroadcastsd	24(%r11), %ymm8
+	vbroadcastsd	56(%r11), %ymm9
+	vbroadcastsd	88(%r11), %ymm10
+	vbroadcastsd	120(%r11), %ymm11
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vmulpd			%ymm15, %ymm9, %ymm9
+	vmulpd			%ymm15, %ymm10, %ymm10
+	vmulpd			%ymm15, %ymm11, %ymm11
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vfmadd231pd		32(%r11), %ymm9, %ymm0
+	vfmadd231pd		64(%r11), %ymm10, %ymm0
+	vfmadd231pd		96(%r11), %ymm11, %ymm0
+	vmovsd			%xmm8, 24(%r11)
+	vmovsd			%xmm9, 56(%r11)
+	vmovsd			%xmm10, 88(%r11)
+	vmovsd			%xmm11, 120(%r11)
+	subl	$4, %r10d
+	addq	$128, %r11
+	cmpl	$3, %r10d
+	jg		103b
+109:
+	cmpl	$0, %r10d
+	jle		104f
+105:
+	vbroadcastsd	24(%r11), %ymm8
+	vmulpd			%ymm15, %ymm8, %ymm8
+	vfmadd231pd		0(%r11), %ymm8, %ymm0
+	vmovsd			%xmm8, 24(%r11)
+	subl	$1, %r10d
+	addq	$32, %r11
+	cmpl	$0, %r10d
+	jg		105b
+104:
+
+	vxorpd			%xmm12, %xmm12, %xmm12
+
+	vmovapd			0(%r13), %ymm14
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm14
+	vmulpd			%ymm14, %ymm0, %ymm1
+
+	vmovapd			32(%r13), %ymm14
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm14
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm1
+
+	vmovapd			64(%r13), %ymm14
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm14
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vfmadd231pd		%ymm14, %ymm13, %ymm1
+
+	vbroadcastsd	120(%r13), %ymm15
+	vmulpd			%ymm15, %ymm1, %ymm1
+	vmovapd			96(%r13), %ymm0
+	vblendpd		$0x7, %ymm1, %ymm0, %ymm0
+	vmovapd			%ymm0, 96(%r13)
+
+102:
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgelqf_dlarft4_4_lib4, .-kernel_dgelqf_dlarft4_4_lib4
+#endif
+
+
+
+
+
+//                             1           2
+// void kernel_dlarfb_12_lib4(double *dK, double *pT)
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dlarfb_12_lib4
+	.type kernel_dlarfb_12_lib4, @function
+kernel_dlarfb_12_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dlarfb_12_lib4
+_kernel_dlarfb_12_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dlarfb_12_lib4
+	.def kernel_dlarfb_12_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb_12_lib4:
+#endif
+	
+	PROLOGUE
+
+	movq	ARG1, %r10 // K
+	movq	ARG2, %r11 // T
+	movq	$384, %r12 // sdt !!!!!!!!!!!!!!!!!!!!!!!!!
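+	// editorial note (added, not from the original author): sdt = 384 bytes is the
+	// hard-coded panel stride of T, i.e. each 4-row panel holds 12 columns of doubles
+	// (4*12*8 bytes); as far as the addressing indicates, the unrolled sequence below
+	// overwrites the twelve columns of the 4x12 block K with the product K*T, using
+	// only the upper-triangular part of the 12x12 matrix T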
+
+	//
+	vmovapd			352(%r10), %ymm12
+	vbroadcastsd	376(%r11, %r12, 2), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm11
+	//
+	vmovapd			320(%r10), %ymm12
+	vbroadcastsd	368(%r11, %r12, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	vbroadcastsd	336(%r11, %r12, 2), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm10
+	//
+	vmovapd			288(%r10), %ymm12
+	vbroadcastsd	360(%r11, %r12, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	vbroadcastsd	328(%r11, %r12, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	296(%r11, %r12, 2), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm9
+	//
+	vmovapd			256(%r10), %ymm12
+	vbroadcastsd	352(%r11, %r12, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	vbroadcastsd	320(%r11, %r12, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	288(%r11, %r12, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	256(%r11, %r12, 2), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm8
+	//
+	vmovapd			224(%r10), %ymm12
+	vbroadcastsd	376(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	vbroadcastsd	344(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	312(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	280(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	248(%r11, %r12, 1), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm7
+	//
+	vmovapd			192(%r10), %ymm12
+	vbroadcastsd	368(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	vbroadcastsd	336(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	304(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	272(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	240(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	208(%r11, %r12, 1), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm6
+	//
+	vmovapd			160(%r10), %ymm12
+	vbroadcastsd	360(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	vbroadcastsd	328(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	296(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	264(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	232(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	200(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	168(%r11, %r12, 1), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm5
+	//
+	vmovapd			128(%r10), %ymm12
+	vbroadcastsd	352(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	vbroadcastsd	320(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	288(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	256(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	224(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	192(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	160(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	128(%r11, %r12, 1), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm4
+	//
+	vmovapd			96(%r10), %ymm12
+	vbroadcastsd	376(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	vbroadcastsd	344(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	312(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	280(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	248(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	216(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	184(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	152(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	120(%r11), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm3
+	//
+	vmovapd			64(%r10), %ymm12
+	vbroadcastsd	368(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	vbroadcastsd	336(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	304(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	272(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	240(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	208(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	176(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	144(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	112(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	80(%r11), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm2
+	//
+	vmovapd			32(%r10), %ymm12
+	vbroadcastsd	360(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	vbroadcastsd	328(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	296(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	264(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	232(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	200(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	168(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	136(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	104(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	72(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	40(%r11), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm1
+	//
+	vmovapd			0(%r10), %ymm12
+	vbroadcastsd	352(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	vbroadcastsd	320(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	288(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	256(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	224(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	192(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	160(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	128(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	96(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	64(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	32(%r11), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm12, %ymm13, %ymm0
+
+	vmovapd			%ymm11, 352(%r10)
+	vmovapd			%ymm10, 320(%r10)
+	vmovapd			%ymm9, 288(%r10)
+	vmovapd			%ymm8, 256(%r10)
+	vmovapd			%ymm7, 224(%r10)
+	vmovapd			%ymm6, 192(%r10)
+	vmovapd			%ymm5, 160(%r10)
+	vmovapd			%ymm4, 128(%r10)
+	vmovapd			%ymm3, 96(%r10)
+	vmovapd			%ymm2, 64(%r10)
+	vmovapd			%ymm1, 32(%r10)
+	vmovapd			%ymm0, 0(%r10)
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlarfb_12_lib4, .-kernel_dlarfb_12_lib4
+#endif
+
+
+
+
+
+// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 100...0 100...0 100...0 100...0 }
+#elif defined(OS_MAC)
+LC00: // { 100...0 100...0 100...0 100...0 }
+	.align 5
+#endif
+	.long	0x00000000
+	.long	0x80000000
+	.long	0x00000000
+	.long	0x80000000
+	.long	0x00000000
+	.long	0x80000000
+	.long	0x00000000
+	.long	0x80000000
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01:
+#elif defined(OS_MAC)
+LC01:
+	.align 5
+#endif
+	.double	-1.0
+	.double	-1.0
+	.double	-1.0
+	.double	-1.0
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02:
+#elif defined(OS_MAC)
+LC02:
+	.align 5
+#endif
+	.double	1.0
+	.double	1.0
+	.double	1.0
+	.double	1.0
+
+
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgelqf_4_lib4_bkp.c b/kernel/avx2/kernel_dgelqf_4_lib4_bkp.c
new file mode 100644
index 0000000..05c2d2e
--- /dev/null
+++ b/kernel/avx2/kernel_dgelqf_4_lib4_bkp.c
@@ -0,0 +1,282 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+#include "../../include/blasfeo_d_kernel.h"
+
+
+
+// assume n>=4
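+// Reference (scalar) version of the 4-wide LQ panel factorization: for each of the four
+// rows of pD it builds a LAPACK-style Householder reflector (as in dgelqf), stores the
+// scaled reflector back into the trailing part of the row, the scalar factor in dD, and
+// accumulates the upper-triangular T factor of the block reflector (as in dlarft) in pT.
+// For a row with diagonal entry alpha and trailing squared norm s it uses
+//   beta = -sign(alpha)*sqrt(alpha*alpha + s),  dD = (beta-alpha)/beta,  v = x/(alpha-beta)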
+void kernel_dgelqf_dlarft_4_lib4(int n, double *pD, double *dD, double *pT)
+	{
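+	// note: the early return below makes the remainder of this routine unreachable;
+	// this backup copy is apparently kept only for reference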
+	return;
+	int ii, jj, ll;
+	double alpha, beta, tmp, w0, w1, w2, w3;
+	const int ps = 4;
+	// zero tau matrix
+	for(ii=0; ii<16; ii++)
+		pT[ii] = 0.0;
+	// first column
+	beta = 0.0;
+	for(ii=1; ii<n; ii++)
+		{
+		tmp = pD[0+ps*ii];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		dD[0] = 0.0;
+		goto col2;
+		}
+	alpha = pD[0+ps*0];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[0] = (beta-alpha) / beta;
+	pT[0+ps*0] = - dD[0];
+	tmp = -1.0 / (beta-alpha);
+	//
+	pD[0+ps*0] = beta;
+	w1 = pD[1+ps*0];
+	w2 = pD[2+ps*0];
+	w3 = pD[3+ps*0];
+	//
+	pD[0+ps*1] *= tmp;
+	w1 += pD[1+ps*1] * pD[0+ps*1];
+	w2 += pD[2+ps*1] * pD[0+ps*1];
+	w3 += pD[3+ps*1] * pD[0+ps*1];
+	//
+	pD[0+ps*2] *= tmp;
+	w1 += pD[1+ps*2] * pD[0+ps*2];
+	w2 += pD[2+ps*2] * pD[0+ps*2];
+	w3 += pD[3+ps*2] * pD[0+ps*2];
+	//
+	pD[0+ps*3] *= tmp;
+	w1 += pD[1+ps*3] * pD[0+ps*3];
+	w2 += pD[2+ps*3] * pD[0+ps*3];
+	w3 += pD[3+ps*3] * pD[0+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[0+ps*ii] *= tmp;
+		w1 += pD[1+ps*ii] * pD[0+ps*ii];
+		w2 += pD[2+ps*ii] * pD[0+ps*ii];
+		w3 += pD[3+ps*ii] * pD[0+ps*ii];
+		}
+	//
+	w1 = - dD[0] * w1;
+	w2 = - dD[0] * w2;
+	w3 = - dD[0] * w3;
+	//
+	pD[1+ps*0] += w1;
+	pD[2+ps*0] += w2;
+	pD[3+ps*0] += w3;
+	//
+	pD[1+ps*1] += w1 * pD[0+ps*1];
+	pD[2+ps*1] += w2 * pD[0+ps*1];
+	pD[3+ps*1] += w3 * pD[0+ps*1];
+	//
+	pD[1+ps*2] += w1 * pD[0+ps*2];
+	pD[2+ps*2] += w2 * pD[0+ps*2];
+	pD[3+ps*2] += w3 * pD[0+ps*2];
+	beta = pD[1+ps*2] * pD[1+ps*2];
+	//
+	pD[1+ps*3] += w1 * pD[0+ps*3];
+	pD[2+ps*3] += w2 * pD[0+ps*3];
+	pD[3+ps*3] += w3 * pD[0+ps*3];
+	beta += pD[1+ps*3] * pD[1+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] += w1 * pD[0+ps*ii];
+		pD[2+ps*ii] += w2 * pD[0+ps*ii];
+		pD[3+ps*ii] += w3 * pD[0+ps*ii];
+		beta += pD[1+ps*ii] * pD[1+ps*ii];
+		}
+	// second column
+col2:
+	if(beta==0.0)
+		{
+		dD[1] = 0.0;
+		tmp = 0.0;
+		goto col3;
+		}
+	alpha = pD[1+ps*1];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[1] = (beta-alpha) / beta;
+	pT[1+ps*1] = - dD[1];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[1+ps*1] = beta;
+	w0 = pD[0+ps*1]; //
+	w2 = pD[2+ps*1];
+	w3 = pD[3+ps*1];
+	//
+	pD[1+ps*2] *= tmp;
+	w0 += pD[0+ps*2] * pD[1+ps*2]; //
+	w2 += pD[2+ps*2] * pD[1+ps*2];
+	w3 += pD[3+ps*2] * pD[1+ps*2];
+	//
+	pD[1+ps*3] *= tmp;
+	w0 += pD[0+ps*3] * pD[1+ps*3]; //
+	w2 += pD[2+ps*3] * pD[1+ps*3];
+	w3 += pD[3+ps*3] * pD[1+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] *= tmp;
+		w0 += pD[0+ps*ii] * pD[1+ps*ii]; //
+		w2 += pD[2+ps*ii] * pD[1+ps*ii];
+		w3 += pD[3+ps*ii] * pD[1+ps*ii];
+		}
+	//
+	pT[0+ps*1] = - dD[1] * (w0*pT[0+ps*0]);
+	w2 = - dD[1] * w2;
+	w3 = - dD[1] * w3;
+	//
+	pD[2+ps*1] += w2;
+	pD[3+ps*1] += w3;
+	//
+	pD[2+ps*2] += w2 * pD[1+ps*2];
+	pD[3+ps*2] += w3 * pD[1+ps*2];
+	//
+	pD[2+ps*3] += w2 * pD[1+ps*3];
+	pD[3+ps*3] += w3 * pD[1+ps*3];
+	beta = pD[2+ps*3] * pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] += w2 * pD[1+ps*ii];
+		pD[3+ps*ii] += w3 * pD[1+ps*ii];
+		beta += pD[2+ps*ii] * pD[2+ps*ii];
+		}
+	// third column
+col3:
+	if(beta==0.0)
+		{
+		dD[2] = 0.0;
+		tmp = 0.0;
+		goto col4;
+		}
+	alpha = pD[2+ps*2];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[2] = (beta-alpha) / beta;
+	pT[2+ps*2] = - dD[2];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[2+ps*2] = beta;
+	w0 = pD[0+ps*2];
+	w1 = pD[1+ps*2];
+	w3 = pD[3+ps*2];
+	//
+	pD[2+ps*3] *= tmp;
+	w0 += pD[0+ps*3] * pD[2+ps*3];
+	w1 += pD[1+ps*3] * pD[2+ps*3];
+	w3 += pD[3+ps*3] * pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] *= tmp;
+		w0 += pD[0+ps*ii] * pD[2+ps*ii];
+		w1 += pD[1+ps*ii] * pD[2+ps*ii];
+		w3 += pD[3+ps*ii] * pD[2+ps*ii];
+		}
+	//
+	pT[0+ps*2] = - dD[2] * (w0*pT[0+ps*0] + w1*pT[0+ps*1]);
+	pT[1+ps*2] = - dD[2] * (w1*pT[1+ps*1]);
+	w3 = - dD[2] * w3;
+//printf("\n%f %f %f\n", pT[0+ps*2], pT[1+ps*2], w3);
+//return;
+	//
+	pD[3+ps*2] += w3;
+	//
+	pD[3+ps*3] += w3 * pD[2+ps*3];
+	//
+	beta = 0.0;
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] += w3 * pD[2+ps*ii];
+		beta += pD[3+ps*ii] * pD[3+ps*ii];
+		}
+	// fourth column
+col4:
+	if(beta==0.0)
+		{
+		dD[3] = 0.0;
+		tmp = 0.0;
+		return;
+		}
+	alpha = pD[3+ps*3];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[3] = (beta-alpha) / beta;
+	pT[3+ps*3] = - dD[3];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[3+ps*3] = beta;
+	w0 =  pD[0+ps*3];
+	w1 =  pD[1+ps*3];
+	w2 =  pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] *= tmp;
+		w0 += pD[0+ps*ii] * pD[3+ps*ii];
+		w1 += pD[1+ps*ii] * pD[3+ps*ii];
+		w2 += pD[2+ps*ii] * pD[3+ps*ii];
+		}
+	//
+	pT[0+ps*3] = - dD[3] * (w0*pT[0+ps*0] + w1*pT[0+ps*1] + w2*pT[0+ps*2]);
+	pT[1+ps*3] = - dD[3] * (w1*pT[1+ps*1] + w2*pT[1+ps*2]);
+	pT[2+ps*3] = - dD[3] * (w2*pT[2+ps*2]);
+	return;
+	}
+
+
+
+
diff --git a/kernel/avx2/kernel_dgemm_12x4_lib4.S b/kernel/avx2/kernel_dgemm_12x4_lib4.S
new file mode 100644
index 0000000..766cb92
--- /dev/null
+++ b/kernel/avx2/kernel_dgemm_12x4_lib4.S
@@ -0,0 +1,15536 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
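+
+
+
+// The PROLOGUE/EPILOGUE macros above spill and restore the callee-saved registers of the
+// respective calling convention (rbx, rbp, r12-r15 on System V; additionally rdi, rsi and
+// xmm6-xmm15 on Windows x64) and execute vzeroupper on entry and before returning.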
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+4*k*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
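+//
+// In plain scalar terms (a sketch added for orientation, ignoring the in-register
+// permutation documented above) the routine accumulates a 12x4 block of D += A * B^T
+// over k iterations:
+//   for(kk=0; kk<k; kk++)
+//     for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<12; ii++)
+//         D[ii][jj] += A[ii][kk] * B[jj][kk];
+// with A stored as three 4-row lib4 panels spaced 4*sda doubles apart and B as one
+// 4-row panel.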
+
+
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nt_12x4_lib4, @function
+inner_kernel_dgemm_add_nt_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nt_12x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_12x4_lib4:
+#endif
+#endif
+	
+// broadcast scheme
+#if 1
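+// (this selects the broadcast variant below; the shuffle variant in the #else branch
+// further down is an alternative implementation that is currently compiled out)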
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 0(%r11), %ymm13 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+	vmovapd 0(%r11, %r12, 2), %ymm15 // A2[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	subl	$4, %r10d
+
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vmovapd			32(%r11), %ymm13 // A0
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vmovapd			32(%r11, %r12, 1), %ymm14 // A1
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	vmovapd			32(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 1
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	48(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	56(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vmovapd			64(%r11), %ymm13 // A0
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A1
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	vmovapd			64(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 2
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	72(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	80(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	88(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vmovapd			96(%r11), %ymm13 // A0
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vmovapd			96(%r11, %r12, 1), %ymm14 // A1
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	vmovapd			96(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 3
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	addq	$128, %r11
+
+	vbroadcastsd	104(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	112(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	120(%r13), %ymm12
+	addq	$128, %r13
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vmovapd			0(%r11), %ymm13 // A0
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vmovapd			0(%r11, %r12, 1), %ymm14 // A1
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	vmovapd			0(%r11, %r12, 2), %ymm15 // A2
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	subl	$4, %r10d
+
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vmovapd			32(%r11), %ymm13 // A0
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vmovapd			32(%r11, %r12, 1), %ymm14 // A1
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	vmovapd			32(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 1
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	48(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	56(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vmovapd			64(%r11), %ymm13 // A0
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A1
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	vmovapd			64(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 2
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	72(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	80(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	88(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vmovapd			96(%r11), %ymm13 // A0
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vmovapd			96(%r11, %r12, 1), %ymm14 // A1
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	vmovapd			96(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 3
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	addq	$128, %r11
+
+	vbroadcastsd	104(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	112(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	120(%r13), %ymm12
+	addq	$128, %r13
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+//	vmovapd			0(%r11), %ymm13 // A0
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+//	vmovapd			0(%r11, %r12, 1), %ymm14 // A1
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+//	vmovapd			0(%r11, %r12, 2), %ymm15 // A2
+
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd			0(%r11), %ymm13 // A0[0]
+	vmovapd 		0(%r11, %r12, 1), %ymm14 // A1[0]
+	vmovapd 		0(%r11, %r12, 2), %ymm15 // A2[0]
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	addq	$32, %r11
+
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	subl	$1, %r10d
+
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	addq	$32, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+// shuffle scheme
+#else
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+	vmovapd 0(%r11), %ymm12 // A0[0]
+	vmovapd 0(%r13), %ymm15 // B[0]
+	vmovapd 0(%r11, %r12, 1), %ymm13 // A1[0]
+	vmovapd 0(%r11, %r12, 2), %ymm14 // A2[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	subl		$4, %r10d
+	vfmadd231pd	%ymm13, %ymm15, %ymm4
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+
+	vshufpd		$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	vfmadd231pd	%ymm13, %ymm15, %ymm5
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+
+	vperm2f128	$0x1, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+	vfmadd231pd	%ymm13, %ymm15, %ymm7
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+	vshufpd		$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vmovapd		32(%r11), %ymm12 // A0[4]
+	vfmadd231pd	%ymm13, %ymm15, %ymm6
+	vmovapd		32(%r11, %r12, 1), %ymm13 // A1[4]
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd		32(%r13), %ymm15 // B[4]
+	vmovapd		32(%r11, %r12, 2), %ymm14 // A2[4]
+
+	// unroll 1
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vfmadd231pd	%ymm13, %ymm15, %ymm4
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+
+	vshufpd 	$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	vfmadd231pd	%ymm13, %ymm15, %ymm5
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+
+	vperm2f128 	$0x1, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+	vfmadd231pd	%ymm13, %ymm15, %ymm7
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+	vshufpd 	$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vmovapd 	64(%r11), %ymm12 // A0[8]
+	vfmadd231pd	%ymm13, %ymm15, %ymm6
+	vmovapd 	64(%r11, %r12, 1), %ymm13 // A1[8]
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd 	64(%r13), %ymm15 // B[8]
+	vmovapd 	64(%r11, %r12, 2), %ymm14 // A2[8]
+
+
+	// unroll 2
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vfmadd231pd	%ymm13, %ymm15, %ymm4
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+
+	vshufpd 	$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	vfmadd231pd	%ymm13, %ymm15, %ymm5
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+
+	vperm2f128 	$0x1, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+	vfmadd231pd	%ymm13, %ymm15, %ymm7
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+	vshufpd 	$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vmovapd 	96(%r11), %ymm12 // A0[12]
+	vfmadd231pd	%ymm13, %ymm15, %ymm6
+	vmovapd 	96(%r11, %r12, 1), %ymm13 // A1[12]
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd 	96(%r13), %ymm15 // B[12]
+	vmovapd 	96(%r11, %r12, 2), %ymm14 // A2[12]
+
+
+	// unroll 3
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	addq		$128, %r11
+	vfmadd231pd	%ymm13, %ymm15, %ymm4
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+
+	vshufpd 	$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	addq		$128, %r13
+	vfmadd231pd	%ymm13, %ymm15, %ymm5
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+
+	vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+	vfmadd231pd	%ymm13, %ymm15, %ymm7
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+	vshufpd 	$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vmovapd 	0(%r11), %ymm12 // A0[0]
+	vfmadd231pd	%ymm13, %ymm15, %ymm6
+	vmovapd 	0(%r11, %r12, 1), %ymm13 // A1[0]
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd 	0(%r13), %ymm15 // B[0]
+	vmovapd 	0(%r11, %r12, 2), %ymm14 // A2[0]
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	subl		$4, %r10d
+	vfmadd231pd	%ymm13, %ymm15, %ymm4
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+
+	vshufpd		$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	vfmadd231pd	%ymm13, %ymm15, %ymm5
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+
+	vperm2f128	$0x1, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+	vfmadd231pd	%ymm13, %ymm15, %ymm7
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+	vshufpd		$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vmovapd		32(%r11), %ymm12 // A0[4]
+	vfmadd231pd	%ymm13, %ymm15, %ymm6
+	vmovapd		32(%r11, %r12, 1), %ymm13 // A1[4]
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd		32(%r13), %ymm15 // B[4]
+	vmovapd		32(%r11, %r12, 2), %ymm14 // A2[4]
+
+	// unroll 1
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vfmadd231pd	%ymm13, %ymm15, %ymm4
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+
+	vshufpd 	$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	vfmadd231pd	%ymm13, %ymm15, %ymm5
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+
+	vperm2f128 	$0x1, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+	vfmadd231pd	%ymm13, %ymm15, %ymm7
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+	vshufpd 	$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vmovapd 	64(%r11), %ymm12 // A0[8]
+	vfmadd231pd	%ymm13, %ymm15, %ymm6
+	vmovapd 	64(%r11, %r12, 1), %ymm13 // A1[8]
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd 	64(%r13), %ymm15 // B[8]
+	vmovapd 	64(%r11, %r12, 2), %ymm14 // A2[8]
+
+
+	// unroll 2
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vfmadd231pd	%ymm13, %ymm15, %ymm4
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+
+	vshufpd 	$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	vfmadd231pd	%ymm13, %ymm15, %ymm5
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+
+	vperm2f128 	$0x1, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+	vfmadd231pd	%ymm13, %ymm15, %ymm7
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+	vshufpd 	$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vmovapd 	96(%r11), %ymm12 // A0[12]
+	vfmadd231pd	%ymm13, %ymm15, %ymm6
+	vmovapd 	96(%r11, %r12, 1), %ymm13 // A1[12]
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd 	96(%r13), %ymm15 // B[12]
+	vmovapd 	96(%r11, %r12, 2), %ymm14 // A2[12]
+
+
+	// unroll 3
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	addq		$128, %r11
+	vfmadd231pd	%ymm13, %ymm15, %ymm4
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+
+	vshufpd 	$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	addq		$128, %r13
+	vfmadd231pd	%ymm13, %ymm15, %ymm5
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+
+	vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+//	cmpl		$4, %r10d
+	vfmadd231pd	%ymm13, %ymm15, %ymm7
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+	vshufpd 	$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+//	vmovapd 	0(%r11), %ymm12 // A0[0]
+	vfmadd231pd	%ymm13, %ymm15, %ymm6
+//	vmovapd 	0(%r11, %r12, 1), %ymm13 // A1[0]
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+//	vmovapd 	0(%r13), %ymm15 // B[0]
+//	vmovapd 	0(%r11, %r12, 2), %ymm14 // A2[0]
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd 	0(%r11), %ymm12 // A0[0]
+	vmovapd 	0(%r11, %r12, 1), %ymm13 // A1[0]
+	vmovapd 	0(%r13), %ymm15 // B[0]
+	vmovapd 	0(%r11, %r12, 2), %ymm14 // A2[0]
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	addq		$32, %r11
+	vfmadd231pd	%ymm13, %ymm15, %ymm4
+	addq		$32, %r13
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+
+	vshufpd 	$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	vfmadd231pd	%ymm13, %ymm15, %ymm5
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+
+	vperm2f128 	$0x1, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+	subl		$1, %r10d
+	vfmadd231pd	%ymm13, %ymm15, %ymm7
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+	vshufpd 	$0x5, %ymm15, %ymm15, %ymm15
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vfmadd231pd	%ymm13, %ymm15, %ymm6
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+
+
+	cmpl		$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nt_12x4_lib4, .-inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+4*k*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
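+// For reference only, kept entirely in comments so it does not affect the
+// build: a minimal C sketch of the update this inner kernel performs,
+// assuming the lib4 panel-major layout implied by the register comments
+// above (A split into three 4-row panels spaced 4*sda doubles apart, B a
+// single 4-row panel of k columns). The function name and arguments are
+// illustrative, not part of the BLASFEO API.
+//
+// void ref_dgemm_sub_nt_12x4(int k, double *A, int sda, double *B, double D[12][4])
+// {
+//     for (int kk = 0; kk < k; kk++)
+//         for (int jj = 0; jj < 4; jj++)
+//             for (int ii = 0; ii < 12; ii++)
+//                 D[ii][jj] -= A[(ii/4)*4*sda + 4*kk + ii%4] * B[4*kk + jj];
+// }
+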
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nt_12x4_lib4, @function
+inner_kernel_dgemm_sub_nt_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nt_12x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_12x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 0(%r11), %ymm13 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+	vmovapd 0(%r11, %r12, 2), %ymm15 // A2[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	8(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+	subl	$4, %r10d
+
+	vbroadcastsd	16(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	24(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	vmovapd			32(%r11), %ymm13 // A0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	vmovapd			32(%r11, %r12, 1), %ymm14 // A1
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+	vmovapd			32(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 1
+	vbroadcastsd	32(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	40(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	48(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	56(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	vmovapd			64(%r11), %ymm13 // A0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A1
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+	vmovapd			64(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 2
+	vbroadcastsd	64(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	72(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	80(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	88(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	vmovapd			96(%r11), %ymm13 // A0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	vmovapd			96(%r11, %r12, 1), %ymm14 // A1
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+	vmovapd			96(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 3
+	vbroadcastsd	96(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+	addq	$128, %r11
+
+	vbroadcastsd	104(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	112(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	120(%r13), %ymm12
+	addq	$128, %r13
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	vmovapd			0(%r11), %ymm13 // A0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	vmovapd			0(%r11, %r12, 1), %ymm14 // A1
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+	vmovapd			0(%r11, %r12, 2), %ymm15 // A2
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	8(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+	subl	$4, %r10d
+
+	vbroadcastsd	16(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	24(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	vmovapd			32(%r11), %ymm13 // A0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	vmovapd			32(%r11, %r12, 1), %ymm14 // A1
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+	vmovapd			32(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 1
+	vbroadcastsd	32(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	40(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	48(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	56(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	vmovapd			64(%r11), %ymm13 // A0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A1
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+	vmovapd			64(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 2
+	vbroadcastsd	64(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	72(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	80(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	88(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	vmovapd			96(%r11), %ymm13 // A0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	vmovapd			96(%r11, %r12, 1), %ymm14 // A1
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+	vmovapd			96(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 3
+	vbroadcastsd	96(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+	addq	$128, %r11
+
+	vbroadcastsd	104(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	112(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	120(%r13), %ymm12
+	addq	$128, %r13
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+//	vmovapd			0(%r11), %ymm13 // A0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+//	vmovapd			0(%r11, %r12, 1), %ymm14 // A1
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+//	vmovapd			0(%r11, %r12, 2), %ymm15 // A2
+
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd			0(%r11), %ymm13 // A0[0]
+	vmovapd 		0(%r11, %r12, 1), %ymm14 // A1[0]
+	vmovapd 		0(%r11, %r12, 2), %ymm15 // A2[0]
+	vbroadcastsd	0(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	8(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+	addq	$32, %r11
+
+	vbroadcastsd	16(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+	subl	$1, %r10d
+
+	vbroadcastsd	24(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+	addq	$32, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nt_12x4_lib4, .-inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// rax   <- dirty
+// rbx   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// rax   <- dirty
+// rbx   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
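+// For reference only (comments, not built): a minimal C sketch of this "nn"
+// variant under the same layout assumptions as the nt kernels above, with B
+// also panel-major (4-row panels, panel stride 4*sdb doubles, advanced by one
+// panel every 4 steps of k). The name and arguments are illustrative.
+//
+// void ref_dgemm_add_nn_12x4(int k, double *A, int sda, double *B, int sdb, double D[12][4])
+// {
+//     for (int kk = 0; kk < k; kk++)
+//         for (int jj = 0; jj < 4; jj++)
+//             for (int ii = 0; ii < 12; ii++)
+//                 D[ii][jj] += A[(ii/4)*4*sda + 4*kk + ii%4]
+//                            * B[(kk/4)*4*sdb + 4*jj + kk%4];
+// }
+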
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NN_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nn_12x4_lib4, @function
+inner_kernel_dgemm_add_nn_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nn_12x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_12x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 0(%r11), %ymm13 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+	vmovapd 0(%r11, %r12, 2), %ymm15 // A2[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	 0(%r13, %r14, 2) // software prefetch
+	prefetcht0	64(%r13, %r14, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	subl	$4, %r10d
+
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vmovapd			32(%r11), %ymm13 // A0
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vmovapd			32(%r11, %r12, 1), %ymm14 // A1
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	vmovapd			32(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 1
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	72(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	104(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vmovapd			64(%r11), %ymm13 // A0
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A1
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	vmovapd			64(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 2
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	48(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	80(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	112(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vmovapd			96(%r11), %ymm13 // A0
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vmovapd			96(%r11, %r12, 1), %ymm14 // A1
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	vmovapd			96(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 3
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	addq	$128, %r11
+
+	vbroadcastsd	56(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	88(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	120(%r13), %ymm12
+	addq	%r14, %r13
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vmovapd			0(%r11), %ymm13 // A0
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vmovapd			0(%r11, %r12, 1), %ymm14 // A1
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	vmovapd			0(%r11, %r12, 2), %ymm15 // A2
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	subl	$4, %r10d
+
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vmovapd			32(%r11), %ymm13 // A0
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vmovapd			32(%r11, %r12, 1), %ymm14 // A1
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	vmovapd			32(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 1
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	72(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	104(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vmovapd			64(%r11), %ymm13 // A0
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A1
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	vmovapd			64(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 2
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	48(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	80(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	112(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vmovapd			96(%r11), %ymm13 // A0
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vmovapd			96(%r11, %r12, 1), %ymm14 // A1
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	vmovapd			96(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 3
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	addq	$128, %r11
+
+	vbroadcastsd	56(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	88(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	120(%r13), %ymm12
+	addq	%r14, %r13
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+//	vmovapd			0(%r11), %ymm13 // A0
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+//	vmovapd			0(%r11, %r12, 1), %ymm14 // A1
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+//	vmovapd			0(%r11, %r12, 2), %ymm15 // A2
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd			0(%r11), %ymm13 // A0[0]
+	vmovapd 		0(%r11, %r12, 1), %ymm14 // A1[0]
+	vmovapd 		0(%r11, %r12, 2), %ymm15 // A2[0]
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	addq	$32, %r11
+
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	subl	$1, %r10d
+
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	addq	$8, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nn_12x4_lib4, .-inner_kernel_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// rax   <- dirty
+// rbx   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// rax   <- dirty
+// rbx   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
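+// For reference only (comments, not built): as in the add_nn sketch above,
+// but with the sign flipped to match the vfnmadd accumulation below.
+//
+// void ref_dgemm_sub_nn_12x4(int k, double *A, int sda, double *B, int sdb, double D[12][4])
+// {
+//     for (int kk = 0; kk < k; kk++)
+//         for (int jj = 0; jj < 4; jj++)
+//             for (int ii = 0; ii < 12; ii++)
+//                 D[ii][jj] -= A[(ii/4)*4*sda + 4*kk + ii%4]
+//                            * B[(kk/4)*4*sdb + 4*jj + kk%4];
+// }
+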
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nn_12x4_lib4, @function
+inner_kernel_dgemm_sub_nn_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nn_12x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_12x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 0(%r11), %ymm13 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+	vmovapd 0(%r11, %r12, 2), %ymm15 // A2[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	 0(%r13, %r14, 2) // software prefetch
+	prefetcht0	64(%r13, %r14, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	32(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+	subl	$4, %r10d
+
+	vbroadcastsd	64(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	96(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	vmovapd			32(%r11), %ymm13 // A0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	vmovapd			32(%r11, %r12, 1), %ymm14 // A1
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+	vmovapd			32(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 1
+	vbroadcastsd	8(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	40(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	72(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	104(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	vmovapd			64(%r11), %ymm13 // A0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A1
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+	vmovapd			64(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 2
+	vbroadcastsd	16(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	48(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	80(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	112(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	vmovapd			96(%r11), %ymm13 // A0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	vmovapd			96(%r11, %r12, 1), %ymm14 // A1
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+	vmovapd			96(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 3
+	vbroadcastsd	24(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+	addq	$128, %r11
+
+	vbroadcastsd	56(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	88(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	120(%r13), %ymm12
+	addq	%r14, %r13
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	vmovapd			0(%r11), %ymm13 // A0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	vmovapd			0(%r11, %r12, 1), %ymm14 // A1
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+	vmovapd			0(%r11, %r12, 2), %ymm15 // A2
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	32(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+	subl	$4, %r10d
+
+	vbroadcastsd	64(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	96(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	vmovapd			32(%r11), %ymm13 // A0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	vmovapd			32(%r11, %r12, 1), %ymm14 // A1
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+	vmovapd			32(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 1
+	vbroadcastsd	8(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	40(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	72(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	104(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	vmovapd			64(%r11), %ymm13 // A0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A1
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+	vmovapd			64(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 2
+	vbroadcastsd	16(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	48(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	80(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	112(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	vmovapd			96(%r11), %ymm13 // A0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	vmovapd			96(%r11, %r12, 1), %ymm14 // A1
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+	vmovapd			96(%r11, %r12, 2), %ymm15 // A2
+
+	// unroll 3
+	vbroadcastsd	24(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+	addq	$128, %r11
+
+	vbroadcastsd	56(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	88(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	120(%r13), %ymm12
+	addq	%r14, %r13
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+//	vmovapd			0(%r11), %ymm13 // A0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+//	vmovapd			0(%r11, %r12, 1), %ymm14 // A1
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+//	vmovapd			0(%r11, %r12, 2), %ymm15 // A2
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd			0(%r11), %ymm13 // A0[0]
+	vmovapd 		0(%r11, %r12, 1), %ymm14 // A1[0]
+	vmovapd 		0(%r11, %r12, 2), %ymm15 // A2[0]
+	vbroadcastsd	0(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vfnmadd231pd	%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	32(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vfnmadd231pd	%ymm15, %ymm12, %ymm9
+	addq	$32, %r11
+
+	vbroadcastsd	64(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vfnmadd231pd	%ymm15, %ymm12, %ymm10
+	subl	$1, %r10d
+
+	vbroadcastsd	96(%r13), %ymm12
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	vfnmadd231pd	%ymm15, %ymm12, %ymm11
+	addq	$8, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nn_12x4_lib4, .-inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
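+// For reference only (comments, not built): a minimal C sketch of this 4x12
+// variant, assuming a single 4-row panel of A and a panel-major B holding 12
+// columns per 4-row panel with panel stride 4*sdb doubles. Names are
+// illustrative.
+//
+// void ref_dgemm_add_nn_4x12(int k, double *A, double *B, int sdb, double D[4][12])
+// {
+//     for (int kk = 0; kk < k; kk++)
+//         for (int jj = 0; jj < 12; jj++)
+//             for (int ii = 0; ii < 4; ii++)
+//                 D[ii][jj] += A[4*kk + ii] * B[(kk/4)*4*sdb + 4*jj + kk%4];
+// }
+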
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NN_4X12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nn_4x12_lib4, @function
+inner_kernel_dgemm_add_nn_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nn_4x12_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x12_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 		0(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	0(%r12, %r13, 2) // software prefetch
+	prefetcht0	64(%r12, %r13, 2) // software prefetch
+	prefetcht0	128(%r12, %r13, 2) // software prefetch
+	prefetcht0	192(%r12, %r13, 2) // software prefetch
+	prefetcht0	256(%r12, %r13, 2) // software prefetch
+	prefetcht0	320(%r12, %r13, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vbroadcastsd	128(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm4
+	vbroadcastsd	160(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm5
+	vbroadcastsd	192(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm6
+	vbroadcastsd	224(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm7
+	vbroadcastsd	256(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm8
+	vbroadcastsd	288(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm9
+	vbroadcastsd	320(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm10
+	vbroadcastsd	352(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm11
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm0
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm1
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm2
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm3
+	vbroadcastsd	136(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vbroadcastsd	168(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vbroadcastsd	200(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vbroadcastsd	232(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vbroadcastsd	264(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm8
+	vbroadcastsd	296(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm9
+	vbroadcastsd	328(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm10
+	vbroadcastsd	360(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm11
+	addq	$128, %r11
+
+	// unroll 2
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vbroadcastsd	144(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm4
+	vbroadcastsd	176(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm5
+	vbroadcastsd	208(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm6
+	vbroadcastsd	240(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm7
+	vbroadcastsd	272(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm8
+	vbroadcastsd	304(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm9
+	vbroadcastsd	336(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm10
+	vbroadcastsd	368(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm11
+
+	// unroll 3
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm1
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm2
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm3
+	vbroadcastsd	152(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vbroadcastsd	184(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vbroadcastsd	216(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vbroadcastsd	248(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vbroadcastsd	280(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm8
+	vbroadcastsd	312(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm9
+	vbroadcastsd	344(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm10
+	vbroadcastsd	376(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm11
+	addq	%r13, %r12
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vbroadcastsd	128(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm4
+	vbroadcastsd	160(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm5
+	vbroadcastsd	192(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm6
+	vbroadcastsd	224(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm7
+	vbroadcastsd	256(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm8
+	vbroadcastsd	288(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm9
+	vbroadcastsd	320(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm10
+	vbroadcastsd	352(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm11
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm0
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm1
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm2
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm3
+	vbroadcastsd	136(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vbroadcastsd	168(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vbroadcastsd	200(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vbroadcastsd	232(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vbroadcastsd	264(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm8
+	vbroadcastsd	296(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm9
+	vbroadcastsd	328(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm10
+	vbroadcastsd	360(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm11
+	addq	$128, %r11
+
+	// unroll 2
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vbroadcastsd	144(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm4
+	vbroadcastsd	176(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm5
+	vbroadcastsd	208(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm6
+	vbroadcastsd	240(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm7
+	vbroadcastsd	272(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm8
+	vbroadcastsd	304(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm9
+	vbroadcastsd	336(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm10
+	vbroadcastsd	368(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm11
+
+	// unroll 3
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm0
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm1
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm2
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm3
+	vbroadcastsd	152(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vbroadcastsd	184(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vbroadcastsd	216(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vbroadcastsd	248(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vbroadcastsd	280(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm8
+	vbroadcastsd	312(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm9
+	vbroadcastsd	344(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm10
+	vbroadcastsd	376(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm11
+	addq	%r13, %r12
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vbroadcastsd	128(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm4
+	vbroadcastsd	160(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm5
+	vbroadcastsd	192(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm6
+	vbroadcastsd	224(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm7
+	vbroadcastsd	256(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm8
+	vbroadcastsd	288(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm9
+	vbroadcastsd	320(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm10
+	vbroadcastsd	352(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm11
+
+	addq	$32, %r11
+	addq	$8, %r12
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nn_4x12_lib4, .-inner_kernel_dgemm_add_nn_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- B
+// r12   <- C
+// r13   <- 32*sdc
+// ymm0  <- [a00 a10 a20 a30]
+// ymm1  <- [a01 a11 a21 a31]
+// ymm2  <- [a02 a12 a22 a32]
+// ymm3  <- [a03 a13 a23 a33]
+// ymm4  <-
+// ymm5  <-
+// ymm6  <-
+// ymm7  <-
+// ymm8  <-
+// ymm9  <-
+// ymm10 <-
+// ymm11 <-
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- ?
+// r12   <- ?
+// r13   <- 32*sdc
+// ymm0  <- [a00 a10 a20 a30]
+// ymm1  <- [a01 a11 a21 a31]
+// ymm2  <- [a02 a12 a22 a32]
+// ymm3  <- [a03 a13 a23 a33]
+// ymm4  <-
+// ymm5  <-
+// ymm6  <-
+// ymm7  <-
+// ymm8  <-
+// ymm9  <-
+// ymm10 <-
+// ymm11 <-
+
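+// For reference only (comments, not built): a minimal C sketch of this "gebp"
+// block-panel update, assuming ymm0..ymm11 hold a 12x4 block of A by columns
+// (three 4-row panels), B is one 4-row panel of k columns, and C is 12 rows
+// by k columns stored in three 4-row panels spaced 4*sdc doubles apart.
+// Names are illustrative.
+//
+// void ref_dgebp_add_nn_12x4(int k, double A[12][4], double *B, double *C, int sdc)
+// {
+//     for (int jj = 0; jj < k; jj++)
+//         for (int ll = 0; ll < 4; ll++)
+//             for (int ii = 0; ii < 12; ii++)
+//                 C[(ii/4)*4*sdc + 4*jj + ii%4] += A[ii][ll] * B[4*jj + ll];
+// }
+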
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEBP_ADD_NN_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgebp_add_nn_12x4_lib4, @function
+inner_kernel_dgebp_add_nn_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgebp_add_nn_12x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_12x4_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r12), %ymm12
+	vmovapd			0(%r12, %r13, 1), %ymm14
+	vmovapd			0(%r12, %r13, 2), %ymm15
+	vbroadcastsd	0(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vfmadd231pd		%ymm4, %ymm13, %ymm14
+	vfmadd231pd		%ymm8, %ymm13, %ymm15
+	vbroadcastsd	8(%r11), %ymm13
+	subl	$4, %r10d
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vfmadd231pd		%ymm5, %ymm13, %ymm14
+	vfmadd231pd		%ymm9, %ymm13, %ymm15
+	vbroadcastsd	16(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vfmadd231pd		%ymm6, %ymm13, %ymm14
+	vfmadd231pd		%ymm10, %ymm13, %ymm15
+	vbroadcastsd	24(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm12
+	vfmadd231pd		%ymm7, %ymm13, %ymm14
+	vfmadd231pd		%ymm11, %ymm13, %ymm15
+	vmovapd			%ymm12, 0(%r12)
+	vmovapd			%ymm14, 0(%r12, %r13, 1)
+	vmovapd			%ymm15, 0(%r12, %r13, 2)
+
+	vmovapd			32(%r12), %ymm12
+	vmovapd			32(%r12, %r13, 1), %ymm14
+	vmovapd			32(%r12, %r13, 2), %ymm15
+	vbroadcastsd	32(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vfmadd231pd		%ymm4, %ymm13, %ymm14
+	vfmadd231pd		%ymm8, %ymm13, %ymm15
+	vbroadcastsd	40(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vfmadd231pd		%ymm5, %ymm13, %ymm14
+	vfmadd231pd		%ymm9, %ymm13, %ymm15
+	vbroadcastsd	48(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vfmadd231pd		%ymm6, %ymm13, %ymm14
+	vfmadd231pd		%ymm10, %ymm13, %ymm15
+	vbroadcastsd	56(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm12
+	vfmadd231pd		%ymm7, %ymm13, %ymm14
+	vfmadd231pd		%ymm11, %ymm13, %ymm15
+	vmovapd			%ymm12, 32(%r12)
+	vmovapd			%ymm14, 32(%r12, %r13, 1)
+	vmovapd			%ymm15, 32(%r12, %r13, 2)
+
+	vmovapd			64(%r12), %ymm12
+	vmovapd			64(%r12, %r13, 1), %ymm14
+	vmovapd			64(%r12, %r13, 2), %ymm15
+	vbroadcastsd	64(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vfmadd231pd		%ymm4, %ymm13, %ymm14
+	vfmadd231pd		%ymm8, %ymm13, %ymm15
+	vbroadcastsd	72(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vfmadd231pd		%ymm5, %ymm13, %ymm14
+	vfmadd231pd		%ymm9, %ymm13, %ymm15
+	vbroadcastsd	80(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vfmadd231pd		%ymm6, %ymm13, %ymm14
+	vfmadd231pd		%ymm10, %ymm13, %ymm15
+	vbroadcastsd	88(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm12
+	vfmadd231pd		%ymm7, %ymm13, %ymm14
+	vfmadd231pd		%ymm11, %ymm13, %ymm15
+	vmovapd			%ymm12, 64(%r12)
+	vmovapd			%ymm14, 64(%r12, %r13, 1)
+	vmovapd			%ymm15, 64(%r12, %r13, 2)
+
+	vmovapd			96(%r12), %ymm12
+	vmovapd			96(%r12, %r13, 1), %ymm14
+	vmovapd			96(%r12, %r13, 2), %ymm15
+	vbroadcastsd	96(%r11), %ymm13
+	addq	$128, %r11
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vfmadd231pd		%ymm4, %ymm13, %ymm14
+	vfmadd231pd		%ymm8, %ymm13, %ymm15
+	vbroadcastsd	-24(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vfmadd231pd		%ymm5, %ymm13, %ymm14
+	vfmadd231pd		%ymm9, %ymm13, %ymm15
+	vbroadcastsd	-16(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vfmadd231pd		%ymm6, %ymm13, %ymm14
+	vfmadd231pd		%ymm10, %ymm13, %ymm15
+	vbroadcastsd	-8(%r11), %ymm13
+	addq	$128, %r12
+	vfmadd231pd		%ymm3, %ymm13, %ymm12
+	vfmadd231pd		%ymm7, %ymm13, %ymm14
+	vfmadd231pd		%ymm11, %ymm13, %ymm15
+	vmovapd			%ymm12, -32(%r12)
+	vmovapd			%ymm14, -32(%r12, %r13, 1)
+	vmovapd			%ymm15, -32(%r12, %r13, 2)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r12), %ymm12
+	vmovapd			0(%r12, %r13, 1), %ymm14
+	vmovapd			0(%r12, %r13, 2), %ymm15
+	vbroadcastsd	0(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vfmadd231pd		%ymm4, %ymm13, %ymm14
+	vfmadd231pd		%ymm8, %ymm13, %ymm15
+	vbroadcastsd	8(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vfmadd231pd		%ymm5, %ymm13, %ymm14
+	vfmadd231pd		%ymm9, %ymm13, %ymm15
+	vbroadcastsd	16(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vfmadd231pd		%ymm6, %ymm13, %ymm14
+	vfmadd231pd		%ymm10, %ymm13, %ymm15
+	vbroadcastsd	24(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm12
+	vfmadd231pd		%ymm7, %ymm13, %ymm14
+	vfmadd231pd		%ymm11, %ymm13, %ymm15
+	vmovapd			%ymm12, 0(%r12)
+	vmovapd			%ymm14, 0(%r12, %r13, 1)
+	vmovapd			%ymm15, 0(%r12, %r13, 2)
+
+	addq	$32, %r11
+	addq	$32, %r12
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // main loop
+
+	// return
+0:
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgebp_add_nn_12x4_lib4, .-inner_kernel_dgebp_add_nn_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 32*sdb
+// r14   <- C
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm4  <-
+// ymm5  <-
+// ymm6  <-
+// ymm7  <-
+// ymm8  <-
+// ymm9  <-
+// ymm10 <-
+// ymm11 <-
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A
+// r12   <- B+?
+// r13   <- 32*sdb
+// r14   <- C+?
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm4  <-
+// ymm5  <-
+// ymm6  <-
+// ymm7  <-
+// ymm8  <-
+// ymm9  <-
+// ymm10 <-
+// ymm11 <-
+
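+// For reference only (comments, not built): a minimal C sketch of this 4x12
+// "gebp" update, assuming A is one 4-row panel of k columns, B is panel-major
+// with 12 columns per 4-row panel and panel stride 4*sdb doubles, and C is a
+// single 4x12 block (column jj at C[4*jj + 0..3]) staged through
+// ymm0..ymm11. Names are illustrative.
+//
+// void ref_dgebp_add_nn_4x12(int k, double *A, double *B, int sdb, double *C)
+// {
+//     for (int ll = 0; ll < k; ll++)
+//         for (int jj = 0; jj < 12; jj++)
+//             for (int ii = 0; ii < 4; ii++)
+//                 C[4*jj + ii] += A[4*ll + ii] * B[(ll/4)*4*sdb + 4*jj + ll%4];
+// }
+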
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEBP_ADD_NN_4X12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgebp_add_nn_4x12_lib4, @function
+inner_kernel_dgebp_add_nn_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_4x12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgebp_add_nn_4x12_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_4x12_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	cmpl	$11, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	// load block from C
+	vmovapd	0(%r14), %ymm0
+	vmovapd	32(%r14), %ymm1
+	vmovapd	64(%r14), %ymm2
+	vmovapd	96(%r14), %ymm3
+	vmovapd	128(%r14), %ymm4
+	vmovapd	160(%r14), %ymm5
+	vmovapd	192(%r14), %ymm6
+	vmovapd	224(%r14), %ymm7
+	vmovapd	256(%r14), %ymm8
+	vmovapd	288(%r14), %ymm9
+	vmovapd	320(%r14), %ymm10
+	vmovapd	352(%r14), %ymm11
+
+	// 0
+	vmovapd			0(%r11), %ymm12
+	vbroadcastsd	0(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	128(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	160(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	192(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	224(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	256(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	288(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	320(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	352(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 1
+	vmovapd			32(%r11), %ymm12
+	vbroadcastsd	8(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	72(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	136(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	168(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	200(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	232(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	264(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	296(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	328(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	360(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 2
+	vmovapd			64(%r11), %ymm12
+	vbroadcastsd	16(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	48(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	112(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	144(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	176(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	208(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	240(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	272(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	304(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	336(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	368(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 3
+	vmovapd			96(%r11), %ymm12
+	vbroadcastsd	24(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	56(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	88(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	152(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	184(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	216(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	248(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	280(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	312(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	344(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	376(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 4
+	vmovapd			128(%r11), %ymm12
+	vbroadcastsd	0(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	128(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	160(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	192(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	224(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	256(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	288(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	320(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	352(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 5
+	vmovapd			160(%r11), %ymm12
+	vbroadcastsd	8(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	72(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	136(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	168(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	200(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	232(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	264(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	296(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	328(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	360(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 6
+	vmovapd			192(%r11), %ymm12
+	vbroadcastsd	16(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	48(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	112(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	144(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	176(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	208(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	240(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	272(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	304(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	336(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	368(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 7
+	vmovapd			224(%r11), %ymm12
+	vbroadcastsd	24(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	56(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	88(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	152(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	184(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	216(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	248(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	280(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	312(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	344(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	376(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 8
+	vmovapd			256(%r11), %ymm12
+	vbroadcastsd	0(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	128(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	160(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	192(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	224(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	256(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	288(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	320(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	352(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 9
+	vmovapd			288(%r11), %ymm12
+	vbroadcastsd	8(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	72(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	136(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	168(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	200(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	232(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	264(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	296(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	328(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	360(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 10
+	vmovapd			320(%r11), %ymm12
+	vbroadcastsd	16(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	48(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	112(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	144(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	176(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	208(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	240(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	272(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	304(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	336(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	368(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// 11
+	vmovapd			352(%r11), %ymm12
+	vbroadcastsd	24(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	56(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	88(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	152(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	184(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	216(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	248(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	280(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	312(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	344(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	376(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	// store block to C
+	vmovapd	%ymm0, 0(%r14)
+	vmovapd	%ymm1, 32(%r14)
+	vmovapd	%ymm2, 64(%r14)
+	vmovapd	%ymm3, 96(%r14)
+	vmovapd	%ymm4, 128(%r14)
+	vmovapd	%ymm5, 160(%r14)
+	vmovapd	%ymm6, 192(%r14)
+	vmovapd	%ymm7, 224(%r14)
+	vmovapd	%ymm8, 256(%r14)
+	vmovapd	%ymm9, 288(%r14)
+	vmovapd	%ymm10, 320(%r14)
+	vmovapd	%ymm11, 352(%r14)
+
+	addq	$384, %r12
+	addq	$384, %r14
+	subl	$12, %r10d
+
+	cmpl	$11, %r10d
+	jg		1b // main loop
+
+2:
+	cmpl	$3, %r10d
+	jle		2f // return
+
+	// cleanup loop
+1:
+	// load block from C
+	vmovapd	0(%r14), %ymm0
+	vmovapd	32(%r14), %ymm1
+	vmovapd	64(%r14), %ymm2
+	vmovapd	96(%r14), %ymm3
+
+	// 0
+	vmovapd			0(%r11), %ymm12
+	vbroadcastsd	0(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 1
+	vmovapd			32(%r11), %ymm12
+	vbroadcastsd	8(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	72(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 2
+	vmovapd			64(%r11), %ymm12
+	vbroadcastsd	16(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	48(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	112(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 3
+	vmovapd			96(%r11), %ymm12
+	vbroadcastsd	24(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	56(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	88(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 4
+	vmovapd			128(%r11), %ymm12
+	vbroadcastsd	0(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 5
+	vmovapd			160(%r11), %ymm12
+	vbroadcastsd	8(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	72(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 6
+	vmovapd			192(%r11), %ymm12
+	vbroadcastsd	16(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	48(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	112(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 7
+	vmovapd			224(%r11), %ymm12
+	vbroadcastsd	24(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	56(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	88(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 8
+	vmovapd			256(%r11), %ymm12
+	vbroadcastsd	0(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 9
+	vmovapd			288(%r11), %ymm12
+	vbroadcastsd	8(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	72(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 10
+	vmovapd			320(%r11), %ymm12
+	vbroadcastsd	16(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	48(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	112(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// 11
+	vmovapd			352(%r11), %ymm12
+	vbroadcastsd	24(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	56(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	88(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	// store block to C
+	vmovapd	%ymm0, 0(%r14)
+	vmovapd	%ymm1, 32(%r14)
+	vmovapd	%ymm2, 64(%r14)
+	vmovapd	%ymm3, 96(%r14)
+
+	addq	$128, %r12
+	addq	$128, %r14
+	subl	$4, %r10d
+
+	cmpl	$3, %r10d
+	jg		1b // cleanup loop
+
+2:
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+1:
+	// load block from C
+	vmovapd	0(%r14), %ymm0
+
+	// 0
+	vmovapd			0(%r11), %ymm12
+	vbroadcastsd	0(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 1
+	vmovapd			32(%r11), %ymm12
+	vbroadcastsd	8(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 2
+	vmovapd			64(%r11), %ymm12
+	vbroadcastsd	16(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 3
+	vmovapd			96(%r11), %ymm12
+	vbroadcastsd	24(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 4
+	vmovapd			128(%r11), %ymm12
+	vbroadcastsd	0(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 5
+	vmovapd			160(%r11), %ymm12
+	vbroadcastsd	8(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 6
+	vmovapd			192(%r11), %ymm12
+	vbroadcastsd	16(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 7
+	vmovapd			224(%r11), %ymm12
+	vbroadcastsd	24(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 8
+	vmovapd			256(%r11), %ymm12
+	vbroadcastsd	0(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 9
+	vmovapd			288(%r11), %ymm12
+	vbroadcastsd	8(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 10
+	vmovapd			320(%r11), %ymm12
+	vbroadcastsd	16(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// 11
+	vmovapd			352(%r11), %ymm12
+	vbroadcastsd	24(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+
+	// store block to C
+	vmovapd	%ymm0, 0(%r14)
+
+	addq	$32, %r12
+	addq	$32, %r14
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+	jg		1b // cleanup loop
+
+	// return
+0:
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgebp_add_nn_4x12_lib4, .-inner_kernel_dgebp_add_nn_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B-offB+bs*sdb*sizeof(double)
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
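+//
+// reference C sketch of the computation performed by this edge (illustrative
+// only; assumes bs=4 lib4 panel-major storage, A split in three 4-row panels
+// with panel stride sda, kend = min(k, 4-offB), indices in doubles):
+//   for(kk=0; kk<kend; kk++)
+//     for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<12; ii++)
+//         D[ii][jj] += A[ii%4 + kk*4 + (ii/4)*4*sda] * B[offB + kk + jj*4];
+//
+// after the loop, if some k remains, B is advanced to the start of its next
+// row panel.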
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGEMM_ADD_NN_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgemm_add_nn_12x4_lib4, @function
+inner_edge_dgemm_add_nn_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgemm_add_nn_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_12x4_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r15d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$4, %ebx
+	subl			%r15d, %ebx // 4-offsetB
+	cmpl			%r10d, %ebx
+//	jle				0f
+//	movl			%r10d, %ebx // kend=min(k,4-offsetB)
+//0:
+	cmovgl			%r10d, %ebx // kend=min(k,4-offsetB)
+
+	movl			%r15d, %eax
+	sall			$3, %eax // offsetB*sizeof(double)
+	addq			%rax, %r13 // B+offsetB*sizeof(double)
+
+	movq			%r11, %rax // A1 <- A0
+	addq			%r12, %rax // A1 <- A0 + 4*sda*sizeof(double)
+
+	movq			%rax, %rbp // A2 <- A1
+	addq			%r12, %rbp // A2 <- A1 + 4*sda*sizeof(double)
+
+1:
+	vmovapd			0(%r11), %ymm12 // A0[0]
+	vmovapd			0(%rax), %ymm14 // A1[0]
+	vmovapd			0(%rbp), %ymm15 // A2[0]
+	vbroadcastsd	0(%r13), %ymm13 // B[0]
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vfmadd231pd		%ymm14, %ymm13, %ymm4
+	vfmadd231pd		%ymm15, %ymm13, %ymm8
+	vbroadcastsd	32(%r13), %ymm13 // B[1]
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vfmadd231pd		%ymm14, %ymm13, %ymm5
+	vfmadd231pd		%ymm15, %ymm13, %ymm9
+	vbroadcastsd	64(%r13), %ymm13 // B[2]
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vfmadd231pd		%ymm14, %ymm13, %ymm6
+	vfmadd231pd		%ymm15, %ymm13, %ymm10
+	vbroadcastsd	96(%r13), %ymm13 // B[3]
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vfmadd231pd		%ymm14, %ymm13, %ymm7
+	vfmadd231pd		%ymm15, %ymm13, %ymm11
+
+	subl			$1, %r10d // k-1
+	subl			$1, %ebx // kend-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$32, %rax // A1+1*bs*sizeof(double)
+	addq			$32, %rbp // A2+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %ebx
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r14, %r13
+	subq			$32, %r13 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgemm_add_nn_12x4_lib4, .-inner_edge_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
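+//
+// reference C sketch (illustrative only; mirrors the 12x4 edge above with a
+// single 4-row A panel and 12 columns of B, kend = min(k, 4-offB)):
+//   for(kk=0; kk<kend; kk++)
+//     for(jj=0; jj<12; jj++)
+//       for(ii=0; ii<4; ii++)
+//         D[ii][jj] += A[ii + kk*4] * B[offB + kk + jj*4];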
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGEMM_ADD_NN_4X12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgemm_add_nn_4x12_lib4, @function
+inner_edge_dgemm_add_nn_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgemm_add_nn_4x12_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x12_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r14d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$4, %r15d
+	subl			%r14d, %r15d // 4-offsetB
+	cmpl			%r10d, %r15d
+//	jle				0f
+//	movl			%r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+	cmovgl			%r10d, %r15d // kend=min(k,4-offsetB)
+
+	movl			%r14d, %eax
+	sall			$3, %eax // offsetB*sizeof(double)
+	addq			%rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+	vmovapd			0(%r11), %ymm12
+	vbroadcastsd	0(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	128(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	160(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	192(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	224(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	256(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	288(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	320(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	352(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+
+	subl			$1, %r10d // k-1
+	subl			$1, %r15d // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r15d
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r13, %r12
+	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgemm_add_nn_4x12_lib4, .-inner_edge_dgemm_add_nn_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10   <- A
+// r11   <- 4*sda*sizeof(double)
+// r12   <- B
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- A+4*4*sizeof(double)
+// r11   <- 4*sda*sizeof(double)
+// r12   <- B+4*4*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
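+//
+// reference C sketch of the 4x4 triangular corner handled here (illustrative
+// only; B is upper triangular and accessed transposed, so column jj of D only
+// receives contributions from kk >= jj; bs=4 lib4 panel storage assumed):
+//   for(kk=0; kk<4; kk++)
+//     for(jj=0; jj<=kk; jj++)
+//       for(ii=0; ii<12; ii++)
+//         D[ii][jj] += A[ii%4 + kk*4 + (ii/4)*4*sda] * B[jj + kk*4];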
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_12x4_lib4, @function
+inner_edge_dtrmm_nt_ru_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_12x4_lib4:
+#endif
+#endif
+	
+	movq	%r10, %r15 // A1 <- A0
+	addq	%r11, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+	movq	%r15, %r14 // A2 <- A1
+	addq	%r11, %r14 // A2 <- A1 + 4*sda*sizeof(double)
+
+	vbroadcastsd	0(%r12), %ymm12
+	vmovapd			0(%r10), %ymm13
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			0(%r15), %ymm14
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vmovapd			0(%r14), %ymm15
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vbroadcastsd	32(%r12), %ymm12
+	vmovapd			32(%r10), %ymm13
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			32(%r15), %ymm14
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vmovapd			32(%r14), %ymm15
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	40(%r12), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	vbroadcastsd	64(%r12), %ymm12
+	vmovapd			64(%r10), %ymm13
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			64(%r15), %ymm14
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vmovapd			64(%r14), %ymm15
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	72(%r12), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	80(%r12), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vbroadcastsd	96(%r12), %ymm12
+	vmovapd			96(%r10), %ymm13
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			96(%r15), %ymm14
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vmovapd			96(%r14), %ymm15
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	104(%r12), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	112(%r12), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	vbroadcastsd	120(%r12), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+
+	addq			$128, %r10
+	addq			$128, %r12
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_12x4_lib4, .-inner_edge_dtrmm_nt_ru_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- max(k-4,0)
+// r11   <- A+4*4*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+4*4*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_12X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_12x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_12x4_vs_lib4:
+#endif
+#endif
+	
+	movq	%r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+	movq	%r15, %r14 // A2 <- A1
+	addq	%r12, %r14 // A2 <- A1 + 4*sda*sizeof(double)
+
+	vbroadcastsd	0(%r13), %ymm12
+	addq			$32, %r13
+	subl			$1, %r10d
+	vmovapd			0(%r11), %ymm13
+	addq			$32, %r11
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			0(%r15), %ymm14
+	addq			$32, %r15
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vmovapd			0(%r14), %ymm15
+	addq			$32, %r14
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vbroadcastsd	0(%r13), %ymm12
+	subl			$1, %r10d
+	vmovapd			0(%r11), %ymm13
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			0(%r15), %ymm14
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vmovapd			0(%r14), %ymm15
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	addq			$32, %r11
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	addq			$32, %r13
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	addq			$32, %r15
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	addq			$32, %r14
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vbroadcastsd	0(%r13), %ymm12
+	subl			$1, %r10d
+	vmovapd			0(%r11), %ymm13
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			0(%r15), %ymm14
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vmovapd			0(%r14), %ymm15
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	addq			$32, %r11
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	addq			$32, %r13
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	addq			$32, %r15
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	addq			$32, %r14
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vbroadcastsd	0(%r13), %ymm12
+	subl			$1, %r10d
+	vmovapd			0(%r11), %ymm13
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			0(%r15), %ymm14
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vmovapd			0(%r14), %ymm15
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	addq			$32, %r11
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	addq			$32, %r13
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	addq			$32, %r15
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+	addq			$32, %r14
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_12x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A0
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B-offB+bs*sdb*sizeof(double)
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
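+//
+// reference C sketch of the offB==0 path (illustrative only; B is lower
+// triangular, so column jj of D only receives contributions from kk >= jj;
+// the offB>0 paths shift the start of B by offB and process correspondingly
+// fewer k iterations before the aligned loop takes over):
+//   for(kk=0; kk<4; kk++)
+//     for(jj=0; jj<=kk; jj++)
+//       for(ii=0; ii<12; ii++)
+//         D[ii][jj] += A[ii%4 + kk*4 + (ii/4)*4*sda] * B[kk + jj*4];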
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NN_RL_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nn_rl_12x4_lib4, @function
+inner_edge_dtrmm_nn_rl_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nn_rl_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_12x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r15d
+	jg		0f
+
+	// offB==0
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vmovapd			32(%r11), %ymm13
+	vmovapd			32(%r11, %r12, 1), %ymm14
+	vmovapd			32(%r11, %r12, 2), %ymm15
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	vmovapd			64(%r11), %ymm13
+	vmovapd			64(%r11, %r12, 1), %ymm14
+	vmovapd			64(%r11, %r12, 2), %ymm15
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	48(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	80(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vmovapd			96(%r11), %ymm13
+	vmovapd			96(%r11, %r12, 1), %ymm14
+	vmovapd			96(%r11, %r12, 2), %ymm15
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	56(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	88(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	vbroadcastsd	120(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A0+4*bs*sizeof(double)
+	addq			%r14, %r13 // B+bs*sdb*sizeof(double)
+
+	jmp		3f
+
+0:
+	cmpl	$1, %r15d
+	jg		1f
+
+	// offB==1
+
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vmovapd			32(%r11), %ymm13
+	vmovapd			32(%r11, %r12, 1), %ymm14
+	vmovapd			32(%r11, %r12, 2), %ymm15
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	vmovapd			64(%r11), %ymm13
+	vmovapd			64(%r11, %r12, 1), %ymm14
+	vmovapd			64(%r11, %r12, 2), %ymm15
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	48(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	80(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	subl			$3, %r10d // k-3
+	addq			$96, %r11 // A0+3*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$8, %r13 // B+bs*sdb*sizeof(double)-1
+
+	jmp		3f
+
+1:
+	cmpl	$2, %r15d
+	jg		2f
+
+	// offB==2
+
+	addq			$16, %r13 // B+2*sizeof(double)
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	vmovapd			32(%r11), %ymm13
+	vmovapd			32(%r11, %r12, 1), %ymm14
+	vmovapd			32(%r11, %r12, 2), %ymm15
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	subl			$2, %r10d // k-2
+	addq			$64, %r11 // A0+2*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$16, %r13 // B+bs*sdb*sizeof(double)-2
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vmovapd			32(%r11), %ymm13
+	vmovapd			32(%r11, %r12, 1), %ymm14
+	vmovapd			32(%r11, %r12, 2), %ymm15
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	72(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	vbroadcastsd	104(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+
+	vmovapd			64(%r11), %ymm13
+	vmovapd			64(%r11, %r12, 1), %ymm14
+	vmovapd			64(%r11, %r12, 2), %ymm15
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	48(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	80(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	vbroadcastsd	112(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+
+	vmovapd			96(%r11), %ymm13
+	vmovapd			96(%r11, %r12, 1), %ymm14
+	vmovapd			96(%r11, %r12, 2), %ymm15
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	56(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	88(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	vbroadcastsd	120(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A0+4*bs*sizeof(double)
+	addq			%r14, %r13 // B+bs*sdb*sizeof(double)
+
+	jmp		3f
+
+2:
+	// offB==3
+
+	addq			$24, %r13 // B+3*sizeof(double)
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-3
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	vmovapd			32(%r11), %ymm13
+	vmovapd			32(%r11, %r12, 1), %ymm14
+	vmovapd			32(%r11, %r12, 2), %ymm15
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	72(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	vmovapd			64(%r11), %ymm13
+	vmovapd			64(%r11, %r12, 1), %ymm14
+	vmovapd			64(%r11, %r12, 2), %ymm15
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	48(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	80(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	vbroadcastsd	112(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+
+	vmovapd			96(%r11), %ymm13
+	vmovapd			96(%r11, %r12, 1), %ymm14
+	vmovapd			96(%r11, %r12, 2), %ymm15
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	56(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	88(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	vbroadcastsd	120(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A0+4*bs*sizeof(double)
+	addq			%r14, %r13 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nn_rl_12x4_lib4, .-inner_edge_dtrmm_nn_rl_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A0
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B-offB+bs*sdb*sizeof(double)
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NN_RL_12X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nn_rl_12x4_vs_lib4, @function
+inner_edge_dtrmm_nn_rl_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nn_rl_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_12x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	cmpl			$0, %r15d
+	jg				0f // offB>0
+
+	// offB==0
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f // end
+
+0:
+	cmpl			$1, %r15d
+	jg				1f // offB>1
+
+	// offB==1
+
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f // end
+
+1:
+	cmpl			$2, %r15d
+	jg				2f // offB>2
+
+	// offB==2
+
+	addq			$16, %r13 // B+2*sizeof(double)
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f
+
+2:
+	// offB==3
+
+	addq			$24, %r13 // B+3*sizeof(double)
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm13
+	vmovapd			0(%r11, %r12, 1), %ymm14
+	vmovapd			0(%r11, %r12, 2), %ymm15
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vfmadd231pd		%ymm15, %ymm12, %ymm8
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vfmadd231pd		%ymm15, %ymm12, %ymm9
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vfmadd231pd		%ymm15, %ymm12, %ymm10
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	vfmadd231pd		%ymm15, %ymm12, %ymm11
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nn_rl_12x4_vs_lib4, .-inner_edge_dtrmm_nn_rl_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
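+//
+// per-lane semantics of the blends used below (illustrative only):
+//   vblendpd $imm, %b, %a, %c   =>   c[i] = ((imm>>i)&1) ? b[i] : a[i],  i=0..3
+// two blend rounds per group of four accumulators move element d(i,j) back to
+// lane i of register j, undoing the lane rotation left by the main loop.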
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_12x4_lib4, @function
+inner_blend_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_12x4_lib4; .scl 2; .type 32; .endef
+inner_blend_12x4_lib4:
+#endif
+#endif
+	
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm12
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm13
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm14
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm15
+
+	vblendpd	$0xc, %ymm14, %ymm12, %ymm0
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm2
+	vblendpd	$0xc, %ymm15, %ymm13, %ymm1
+	vblendpd	$0x3, %ymm15, %ymm13, %ymm3
+
+
+	vblendpd	$0xa, %ymm5, %ymm4, %ymm12
+	vblendpd	$0x5, %ymm5, %ymm4, %ymm13
+	vblendpd	$0xa, %ymm7, %ymm6, %ymm14
+	vblendpd	$0x5, %ymm7, %ymm6, %ymm15
+
+	vblendpd	$0xc, %ymm14, %ymm12, %ymm4
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm6
+	vblendpd	$0xc, %ymm15, %ymm13, %ymm5
+	vblendpd	$0x3, %ymm15, %ymm13, %ymm7
+
+
+	vblendpd	$0xa, %ymm9, %ymm8, %ymm12
+	vblendpd	$0x5, %ymm9, %ymm8, %ymm13
+	vblendpd	$0xa, %ymm11, %ymm10, %ymm14
+	vblendpd	$0x5, %ymm11, %ymm10, %ymm15
+
+	vblendpd	$0xc, %ymm14, %ymm12, %ymm8
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm10
+	vblendpd	$0xc, %ymm15, %ymm13, %ymm9
+	vblendpd	$0x3, %ymm15, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_12x4_lib4, .-inner_blend_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose
+//
+// input arguments:
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
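+//
+// the unpack/permute sequence below is an in-register 4x4 transpose applied to
+// each group of four accumulators (illustrative only):
+//   for(ii=0; ii<4; ii++)
+//     for(jj=0; jj<4; jj++)
+//       out[jj][ii] = in[ii][jj];
+// vunpck{l,h}pd interleave corresponding 64-bit elements within each 128-bit
+// half, vperm2f128 then recombines the 128-bit halves across register pairs.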
+
+#if MACRO_LEVEL>=1
+	.macro INNER_TRAN_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_tran_12x4_lib4, @function
+inner_tran_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_tran_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_tran_12x4_lib4; .scl 2; .type 32; .endef
+inner_tran_12x4_lib4:
+#endif
+#endif
+		
+	vunpcklpd	%ymm1, %ymm0, %ymm12
+	vunpckhpd	%ymm1, %ymm0, %ymm13
+	vunpcklpd	%ymm3, %ymm2, %ymm14
+	vunpckhpd	%ymm3, %ymm2, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm0
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm2
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm3
+
+	vunpcklpd	%ymm5, %ymm4, %ymm12
+	vunpckhpd	%ymm5, %ymm4, %ymm13
+	vunpcklpd	%ymm7, %ymm6, %ymm14
+	vunpckhpd	%ymm7, %ymm6, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm4
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm6
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm5
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm7
+
+	vunpcklpd	%ymm9, %ymm8, %ymm12
+	vunpckhpd	%ymm9, %ymm8, %ymm13
+	vunpcklpd	%ymm11, %ymm10, %ymm14
+	vunpckhpd	%ymm11, %ymm10, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm8
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm10
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm9
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_tran_12x4_lib4, .-inner_tran_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
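+//
+// reference C sketch (illustrative only; alpha=1.0, beta=1.0, C stored in
+// three 4x4 panels with panel stride sdc):
+//   for(jj=0; jj<4; jj++)
+//     for(ii=0; ii<12; ii++)
+//       D[ii][jj] += C[ii%4 + jj*4 + (ii/4)*4*sdc];
+// the multiply by the 1.0 constant from LC05 keeps the fma form of the generic
+// alpha/beta scaling code.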
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_12x4_lib4, @function
+inner_scale_11_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_12x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_12x4_lib4:
+#endif
+#endif
+	
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC05(%rip), %ymm14 // beta=1.0
+#else
+	vmovapd		LC05(%rip), %ymm14 // beta=1.0
+#endif
+
+	vmovapd		0(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+	vmovapd		0(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		32(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		64(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		96(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+	vmovapd		0(%r10, %r11, 2), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+	vmovapd		32(%r10, %r11, 2), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+	vmovapd		64(%r10, %r11, 2), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd		96(%r10, %r11, 2), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_12x4_lib4, .-inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- &alpha
+// r11   <- &beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- &alpha
+// r11   <- &beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
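+//
+// rough scalar sketch (alpha at r10, beta at r11, C in three 4-row panels
+// r13 bytes apart; the beta==0.0 test below skips the loads of C entirely):
+//
+//     for(jj=0; jj<4; jj++)
+//         for(ii=0; ii<12; ii++)
+//             D[ii][jj] = alpha*D[ii][jj] + (beta!=0.0 ? beta*C[ii][jj] : 0.0);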
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_12x4_lib4, @function
+inner_scale_ab_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_12x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_12x4_lib4:
+#endif
+#endif
+		
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	vmulpd		%ymm8, %ymm15, %ymm8
+	vmulpd		%ymm9, %ymm15, %ymm9
+	vmulpd		%ymm10, %ymm15, %ymm10
+	vmulpd		%ymm11, %ymm15, %ymm11
+
+	movq	%r12, %r15 // C1 <- C0
+	addq	%r13, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	movq	%r15, %r14 // C2 <- C1
+	addq	%r13, %r14 // C2 <- C1 + 4*sdc*sizeof(double)
+
+	vbroadcastsd 0(%r11), %ymm14 // beta
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+	vmovapd		0(%r15), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		32(%r15), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		64(%r15), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		96(%r15), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+	vmovapd		0(%r14), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+	vmovapd		32(%r14), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+	vmovapd		64(%r14), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd		96(%r14), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_12x4_lib4, .-inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10   <- &alpha
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- &alpha
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
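+//
+// rough scalar sketch (alpha at r10; beta is implicitly 0.0, so C is not
+// read at all):
+//
+//     for(jj=0; jj<4; jj++)
+//         for(ii=0; ii<12; ii++)
+//             D[ii][jj] = alpha*D[ii][jj];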
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_A0_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_a0_12x4_lib4, @function
+inner_scale_a0_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_a0_12x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_12x4_lib4:
+#endif
+#endif
+		
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	vmulpd		%ymm8, %ymm15, %ymm8
+	vmulpd		%ymm9, %ymm15, %ymm9
+	vmulpd		%ymm10, %ymm15, %ymm10
+	vmulpd		%ymm11, %ymm15, %ymm11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_a0_12x4_lib4, .-inner_scale_a0_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- &alpha
+// r11   <- &beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- &alpha
+// r11   <- &beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
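+//
+// the vblendpd pairs below first bring the accumulator back from the
+// rotated layout documented above (e.g. ymm0 = [d00 d11 d22 d33]) into
+// plain columns; after that the routine is the same alpha/beta scaling as
+// inner_scale_ab_12x4_lib4, roughly:
+//
+//     un-rotate each 4x4 block of the accumulator;
+//     for(jj=0; jj<4; jj++)
+//         for(ii=0; ii<12; ii++)
+//             D[ii][jj] = alpha*D[ii][jj] + (beta!=0.0 ? beta*C[ii][jj] : 0.0);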
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_12x4_lib4, @function
+inner_blend_scale_ab_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_12x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_12x4_lib4:
+#endif
+#endif
+		
+
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm12
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm13
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm14
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm15
+
+	vblendpd	$0xc, %ymm14, %ymm12, %ymm0
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm2
+	vblendpd	$0xc, %ymm15, %ymm13, %ymm1
+	vblendpd	$0x3, %ymm15, %ymm13, %ymm3
+
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vblendpd	$0xa, %ymm5, %ymm4, %ymm12
+	vblendpd	$0x5, %ymm5, %ymm4, %ymm13
+	vblendpd	$0xa, %ymm7, %ymm6, %ymm14
+	vblendpd	$0x5, %ymm7, %ymm6, %ymm15
+
+	vblendpd	$0xc, %ymm14, %ymm12, %ymm4
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm6
+	vblendpd	$0xc, %ymm15, %ymm13, %ymm5
+	vblendpd	$0x3, %ymm15, %ymm13, %ymm7
+
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	vblendpd	$0xa, %ymm9, %ymm8, %ymm12
+	vblendpd	$0x5, %ymm9, %ymm8, %ymm13
+	vblendpd	$0xa, %ymm11, %ymm10, %ymm14
+	vblendpd	$0x5, %ymm11, %ymm10, %ymm15
+
+	vblendpd	$0xc, %ymm14, %ymm12, %ymm8
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm10
+	vblendpd	$0xc, %ymm15, %ymm13, %ymm9
+	vblendpd	$0x3, %ymm15, %ymm13, %ymm11
+
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm8, %ymm15, %ymm8
+	vmulpd		%ymm9, %ymm15, %ymm9
+	vmulpd		%ymm10, %ymm15, %ymm10
+	vmulpd		%ymm11, %ymm15, %ymm11
+
+	vbroadcastsd 0(%r11), %ymm14 // beta
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+	vmovapd		0(%r12, %r13, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		32(%r12, %r13, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		64(%r12, %r13, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		96(%r12, %r13, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+	vmovapd		0(%r12, %r13, 2), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+	vmovapd		32(%r12, %r13, 2), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+	vmovapd		64(%r12, %r13, 2), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd		96(%r12, %r13, 2), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_12x4_lib4, .-inner_blend_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
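+//
+// rough scalar sketch: the 4x12 result sits in ymm0..ymm11 (one 4-element
+// column each) and C is a single contiguous 4x12 panel at r12 (byte offsets
+// 0, 32, ..., 352); the beta==0.0 test below skips the loads of C:
+//
+//     for(jj=0; jj<12; jj++)
+//         for(ii=0; ii<4; ii++)
+//             D[ii][jj] = alpha*D[ii][jj] + (beta!=0.0 ? beta*C[ii][jj] : 0.0);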
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_4X12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_4x12_lib4, @function
+inner_scale_ab_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_4x12_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x12_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+	vmulpd		%ymm8, %ymm15, %ymm8
+	vmulpd		%ymm9, %ymm15, %ymm9
+	vmulpd		%ymm10, %ymm15, %ymm10
+	vmulpd		%ymm11, %ymm15, %ymm11
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+	vmovapd		128(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		160(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		192(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		224(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+	vmovapd		256(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+	vmovapd		288(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+	vmovapd		320(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd		352(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_4x12_lib4, .-inner_scale_ab_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- &alpha
+// r11   <- &beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- &alpha
+// r11   <- &beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
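+//
+// same 4x4 transpose idiom as inner_tran_12x4_lib4, applied to the three
+// register quadruples, followed by the alpha/beta scaling of
+// inner_scale_ab_4x12_lib4 against the contiguous 4x12 panel C at r12;
+// roughly:
+//
+//     transpose each 4x4 block of the accumulator;
+//     for(jj=0; jj<12; jj++)
+//         for(ii=0; ii<4; ii++)
+//             D[ii][jj] = alpha*D[ii][jj] + (beta!=0.0 ? beta*C[ii][jj] : 0.0);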
+
+#if MACRO_LEVEL>=1
+	.macro INNER_TRAN_SCALE_AB_4X12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_tran_scale_ab_4x12_lib4, @function
+inner_tran_scale_ab_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_tran_scale_ab_4x12_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x12_lib4:
+#endif
+#endif
+		
+	vunpcklpd	%ymm1, %ymm0, %ymm12
+	vunpckhpd	%ymm1, %ymm0, %ymm13
+	vunpcklpd	%ymm3, %ymm2, %ymm14
+	vunpckhpd	%ymm3, %ymm2, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm0
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm2
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm3
+
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vunpcklpd	%ymm5, %ymm4, %ymm12
+	vunpckhpd	%ymm5, %ymm4, %ymm13
+	vunpcklpd	%ymm7, %ymm6, %ymm14
+	vunpckhpd	%ymm7, %ymm6, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm4
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm6
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm5
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm7
+
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	vunpcklpd	%ymm9, %ymm8, %ymm12
+	vunpckhpd	%ymm9, %ymm8, %ymm13
+	vunpcklpd	%ymm11, %ymm10, %ymm14
+	vunpckhpd	%ymm11, %ymm10, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm8
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm10
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm9
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm11
+
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm8, %ymm15, %ymm8
+	vmulpd		%ymm9, %ymm15, %ymm9
+	vmulpd		%ymm10, %ymm15, %ymm10
+	vmulpd		%ymm11, %ymm15, %ymm11
+
+	vbroadcastsd 0(%r11), %ymm14 // beta
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+	vmovapd		128(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		160(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		192(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		224(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+	vmovapd		256(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+	vmovapd		288(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+	vmovapd		320(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd		352(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_tran_scale_ab_4x12_lib4, .-inner_tran_scale_ab_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
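+//
+// un-rotation (vblendpd pairs) followed by plain accumulation of C, i.e.
+// the alpha=1.0/beta=1.0 case of the blend-and-scale routine above; roughly:
+//
+//     un-rotate each 4x4 block of the accumulator;
+//     for(jj=0; jj<4; jj++)
+//         for(ii=0; ii<12; ii++)
+//             D[ii][jj] += C[ii][jj];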
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_12x4_lib4, @function
+inner_blend_scale_11_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_12x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_12x4_lib4:
+#endif
+#endif
+	
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm12
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm13
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm14
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm15
+
+	vblendpd	$0xc, %ymm14, %ymm12, %ymm0
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm2
+	vblendpd	$0xc, %ymm15, %ymm13, %ymm1
+	vblendpd	$0x3, %ymm15, %ymm13, %ymm3
+
+	vblendpd	$0xa, %ymm5, %ymm4, %ymm12
+	vblendpd	$0x5, %ymm5, %ymm4, %ymm13
+	vblendpd	$0xa, %ymm7, %ymm6, %ymm14
+	vblendpd	$0x5, %ymm7, %ymm6, %ymm15
+
+	vblendpd	$0xc, %ymm14, %ymm12, %ymm4
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm6
+	vblendpd	$0xc, %ymm15, %ymm13, %ymm5
+	vblendpd	$0x3, %ymm15, %ymm13, %ymm7
+
+	vblendpd	$0xa, %ymm9, %ymm8, %ymm12
+	vblendpd	$0x5, %ymm9, %ymm8, %ymm13
+	vblendpd	$0xa, %ymm11, %ymm10, %ymm14
+	vblendpd	$0x5, %ymm11, %ymm10, %ymm15
+
+	vblendpd	$0xc, %ymm14, %ymm12, %ymm8
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm10
+	vblendpd	$0xc, %ymm15, %ymm13, %ymm9
+	vblendpd	$0x3, %ymm15, %ymm13, %ymm11
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC05(%rip), %ymm14 // beta=1.0
+#else
+	vmovapd		LC05(%rip), %ymm14 // beta=1.0
+#endif
+
+	vmovapd		0(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+	vmovapd		0(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		32(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		64(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		96(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+	vmovapd		0(%r10, %r11, 2), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+	vmovapd		32(%r10, %r11, 2), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+	vmovapd		64(%r10, %r11, 2), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd		96(%r10, %r11, 2), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_12x4_lib4, .-inner_blend_scale_11_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_TRAN_SCALE_11_4X12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_tran_scale_11_4x12_lib4, @function
+inner_tran_scale_11_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_11_4x12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_tran_scale_11_4x12_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_11_4x12_lib4:
+#endif
+#endif
+		
+	vunpcklpd	%ymm1, %ymm0, %ymm12
+	vunpckhpd	%ymm1, %ymm0, %ymm13
+	vunpcklpd	%ymm3, %ymm2, %ymm14
+	vunpckhpd	%ymm3, %ymm2, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm0
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm2
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm3
+
+	vunpcklpd	%ymm5, %ymm4, %ymm12
+	vunpckhpd	%ymm5, %ymm4, %ymm13
+	vunpcklpd	%ymm7, %ymm6, %ymm14
+	vunpckhpd	%ymm7, %ymm6, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm4
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm6
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm5
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm7
+
+	vunpcklpd	%ymm9, %ymm8, %ymm12
+	vunpckhpd	%ymm9, %ymm8, %ymm13
+	vunpcklpd	%ymm11, %ymm10, %ymm14
+	vunpckhpd	%ymm11, %ymm10, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm8
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm10
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm9
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm11
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC05(%rip), %ymm14 // beta=1.0
+#else
+	vmovapd		LC05(%rip), %ymm14 // beta=1.0
+#endif
+
+	vmovapd		0(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+	vmovapd		128(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		160(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		192(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		224(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+	vmovapd		256(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+	vmovapd		288(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+	vmovapd		320(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd		352(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_tran_scale_11_4x12_lib4, .-inner_tran_scale_11_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization 
+//
+// input arguments:
+// r10   <- inv_diag_E
+// r11d  <- kn
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
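+//
+// rough scalar sketch of the factorization of the top 4x4 diagonal block of
+// the 12x4 tile (D[ii][jj] standing for the tile in ymm0..ymm11; kn in r11d
+// cuts the sweep short, and a non-positive pivot is replaced by a zeroed
+// column, with 0.0 stored as its reciprocal):
+//
+//     for(jj=0; jj<4; jj++) {
+//         tmp = (D[jj][jj]>0.0) ? 1.0/sqrt(D[jj][jj]) : 0.0;
+//         inv_diag_E[jj] = tmp;
+//         for(ii=0; ii<12; ii++) D[ii][jj] *= tmp;
+//         if(jj+1>=kn) break;
+//         for(kk=jj+1; kk<4; kk++)
+//             for(ii=0; ii<12; ii++) D[ii][kk] -= D[ii][jj]*D[kk][jj];
+//     }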
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DPOTRF_12X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dpotrf_12x4_vs_lib4, @function
+inner_edge_dpotrf_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dpotrf_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_12x4_vs_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC05(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd			LC05(%rip), %xmm14 // 1.0
+#endif
+
+	vmovsd			%xmm0, %xmm0, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe				1f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+2:
+	vmovsd			%xmm13, 0(%r10)
+//	vmovddup		%xmm13, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vmulpd			%ymm8, %ymm13, %ymm8
+	cmpl			$2, %r11d
+	jl				0f // ret
+//	vperm2f128		$0x00, %ymm0, %ymm0, %ymm12
+//	vpermilpd		$0xf, %ymm12, %ymm13
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vfnmadd231pd	%ymm8, %ymm13, %ymm9
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vfnmadd231pd	%ymm8, %ymm13, %ymm10
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vfnmadd231pd	%ymm8, %ymm13, %ymm11
+
+	vpermilpd		$0x3, %xmm1, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe				3f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+4:
+	vmovsd			%xmm13, 8(%r10)
+//	vmovddup		%xmm13, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vmulpd			%ymm9, %ymm13, %ymm9
+	cmpl			$3, %r11d
+	jl				0f // ret
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vfnmadd231pd	%ymm9, %ymm13, %ymm10
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vfnmadd231pd	%ymm9, %ymm13, %ymm11
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe				5f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+6:
+	vmovsd			%xmm13, 16(%r10)
+//	vmovddup		%xmm13, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vmulpd			%ymm10, %ymm13, %ymm10
+	cmpl			$4, %r11d
+	jl				0f // ret
+//	vperm2f128		$0x11, %ymm2, %ymm2, %ymm12
+//	vpermilpd		$0xf, %ymm12, %ymm13
+	vpermpd			$0xff, %ymm2, %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+	vfnmadd231pd	%ymm10, %ymm13, %ymm11
+
+//	vextractf128	$0x1, %ymm3, %xmm13
+//	vpermilpd		$0x3, %xmm13, %xmm13
+	vpermpd			$0xff, %ymm3, %ymm13
+	vucomisd		%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe				7f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+8:
+	vmovsd			%xmm13, 24(%r10)
+//	vmovddup		%xmm13, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+	vmulpd			%ymm11, %ymm13, %ymm11
+
+	jmp				0f
+
+1:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				2b
+
+3:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				4b
+
+5:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				6b
+
+7:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				8b
+
+0:
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dpotrf_12x4_vs_lib4, .-inner_edge_dpotrf_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization 
+//
+// input arguments:
+// r10   <- E
+// r11   <- inv_diag_E
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10   <- E
+// r11   <- inv_diag_E
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
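+//
+// rough scalar sketch: right solve of the 12x4 tile (written D below)
+// against the transposed 4x4 lower-triangular E, using the precomputed
+// reciprocal diagonal; E is stored panel-major, element (ii,jj) at byte
+// offset 8*(ii+4*jj):
+//
+//     for(jj=0; jj<4; jj++) {
+//         for(ii=0; ii<12; ii++) D[ii][jj] *= inv_diag_E[jj];
+//         for(kk=jj+1; kk<4; kk++)
+//             for(ii=0; ii<12; ii++) D[ii][kk] -= D[ii][jj]*E[kk][jj];
+//     }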
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_12x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_12x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vmulpd			%ymm8, %ymm13, %ymm8
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vfnmadd231pd	%ymm8, %ymm13, %ymm9
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vfnmadd231pd	%ymm8, %ymm13, %ymm10
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vfnmadd231pd	%ymm8, %ymm13, %ymm11
+
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vmulpd			%ymm9, %ymm13, %ymm9
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vfnmadd231pd	%ymm9, %ymm13, %ymm10
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vfnmadd231pd	%ymm9, %ymm13, %ymm11
+
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vmulpd			%ymm10, %ymm13, %ymm10
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+	vfnmadd231pd	%ymm10, %ymm13, %ymm11
+
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+	vmulpd			%ymm11, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_12x4_lib4, .-inner_edge_dtrsm_rlt_inv_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization 
+//
+// input arguments:
+// r10  <- D
+// r11  <- sdd
+// r12  <- inv_diag_D
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- sdd
+// r12  <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
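+//
+// rough scalar sketch: same right-lower-transposed solve as the 12x4
+// variant, but for the 4x12 tile in ymm0..ymm11 (written acc below) against
+// the 12x12 lower-triangular factor D stored at r10 in three 4-row panels
+// r11 bytes apart (the addq $128 steps move to the next 4-column block):
+//
+//     for(jj=0; jj<12; jj++) {
+//         for(ii=0; ii<4; ii++) acc[ii][jj] *= inv_diag_D[jj];
+//         for(kk=jj+1; kk<12; kk++)
+//             for(ii=0; ii<4; ii++) acc[ii][kk] -= acc[ii][jj]*D[kk][jj];
+//     }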
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_4X12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_4x12_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_4x12_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x12_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r12), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vbroadcastsd	0(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm4
+	vbroadcastsd	8(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm5
+	vbroadcastsd	16(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm6
+	vbroadcastsd	24(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm7
+	vbroadcastsd	0(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm8
+	vbroadcastsd	8(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm9
+	vbroadcastsd	16(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm10
+	vbroadcastsd	24(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm11
+
+	vbroadcastsd	8(%r12), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vbroadcastsd	32(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm4
+	vbroadcastsd	40(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm5
+	vbroadcastsd	48(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm6
+	vbroadcastsd	56(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm7
+	vbroadcastsd	32(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm8
+	vbroadcastsd	40(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm9
+	vbroadcastsd	48(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm10
+	vbroadcastsd	56(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm11
+
+	vbroadcastsd	16(%r12), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vbroadcastsd	64(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm4
+	vbroadcastsd	72(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm5
+	vbroadcastsd	80(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm6
+	vbroadcastsd	88(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm7
+	vbroadcastsd	64(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm8
+	vbroadcastsd	72(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm9
+	vbroadcastsd	80(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm10
+	vbroadcastsd	88(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm11
+
+	vbroadcastsd	24(%r12), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vbroadcastsd	96(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm4
+	vbroadcastsd	104(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm5
+	vbroadcastsd	112(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm6
+	vbroadcastsd	120(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm7
+	vbroadcastsd	96(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm8
+	vbroadcastsd	104(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm9
+	vbroadcastsd	112(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm10
+	vbroadcastsd	120(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm11
+
+	addq	$128, %r10
+
+	vbroadcastsd	32(%r12), %ymm13
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vbroadcastsd	8(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vbroadcastsd	16(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vbroadcastsd	24(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vbroadcastsd	0(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm8
+	vbroadcastsd	8(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm9
+	vbroadcastsd	16(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm10
+	vbroadcastsd	24(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm11
+
+	vbroadcastsd	40(%r12), %ymm13
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vbroadcastsd	48(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vbroadcastsd	56(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vbroadcastsd	32(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm8
+	vbroadcastsd	40(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm9
+	vbroadcastsd	48(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm10
+	vbroadcastsd	56(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm11
+
+	vbroadcastsd	48(%r12), %ymm13
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vbroadcastsd	88(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+	vbroadcastsd	64(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm8
+	vbroadcastsd	72(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm9
+	vbroadcastsd	80(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm10
+	vbroadcastsd	88(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm11
+
+	vbroadcastsd	56(%r12), %ymm13
+	vmulpd			%ymm7, %ymm13, %ymm7
+	vbroadcastsd	96(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm8
+	vbroadcastsd	104(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm9
+	vbroadcastsd	112(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm10
+	vbroadcastsd	120(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm11
+
+	addq	$128, %r10
+
+	vbroadcastsd	64(%r12), %ymm13
+	vmulpd			%ymm8, %ymm13, %ymm8
+	vbroadcastsd	8(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm9
+	vbroadcastsd	16(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm10
+	vbroadcastsd	24(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm11
+
+	vbroadcastsd	72(%r12), %ymm13
+	vmulpd			%ymm9, %ymm13, %ymm9
+	vbroadcastsd	48(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm9, %ymm13, %ymm10
+	vbroadcastsd	56(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm9, %ymm13, %ymm11
+
+	vbroadcastsd	80(%r12), %ymm13
+	vmulpd			%ymm10, %ymm13, %ymm10
+	vbroadcastsd	88(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm10, %ymm13, %ymm11
+
+	vbroadcastsd	88(%r12), %ymm13
+	vmulpd			%ymm11, %ymm13, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_4x12_lib4, .-inner_edge_dtrsm_rlt_inv_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization 
+//
+// input arguments:
+// r10   <- D
+// r11   <- inv_diag_D
+// r12d  <- kn
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11   <- inv_diag_D
+// r12d  <- kn
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
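+//
+// variable-size variant of inner_edge_dtrsm_rlt_inv_12x4_lib4: the same
+// column-by-column solve, except that kn (r12d) cuts the sweep short;
+// roughly (acc standing for the 12x4 tile, D for the 4x4 factor at r10):
+//
+//     for(jj=0; jj<4; jj++) {
+//         for(ii=0; ii<12; ii++) acc[ii][jj] *= inv_diag_D[jj];
+//         if(jj+1>=kn) break;
+//         for(kk=jj+1; kk<4; kk++)
+//             for(ii=0; ii<12; ii++) acc[ii][kk] -= acc[ii][jj]*D[kk][jj];
+//     }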
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_12X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_12x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_12x4_vs_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vmulpd			%ymm8, %ymm13, %ymm8
+	cmpl			$2, %r12d
+	jl				0f // ret
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vfnmadd231pd	%ymm8, %ymm13, %ymm9
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vfnmadd231pd	%ymm8, %ymm13, %ymm10
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vfnmadd231pd	%ymm8, %ymm13, %ymm11
+
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vmulpd			%ymm9, %ymm13, %ymm9
+	cmpl			$3, %r12d
+	jl				0f // ret
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vfnmadd231pd	%ymm9, %ymm13, %ymm10
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vfnmadd231pd	%ymm9, %ymm13, %ymm11
+
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vmulpd			%ymm10, %ymm13, %ymm10
+	cmpl			$4, %r12d
+	jl				0f // ret
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+	vfnmadd231pd	%ymm10, %ymm13, %ymm11
+
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+	vmulpd			%ymm11, %ymm13, %ymm11
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_12x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization 
+//
+// input arguments:
+// r10  <- D
+// r11  <- sdd
+// r12  <- inv_diag_D
+// r13d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- sdd
+// r12  <- inv_diag_D
+// r13d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_4X12_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_4x12_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x12_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x12_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_4x12_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x12_vs_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r12), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vbroadcastsd	0(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm4
+	vbroadcastsd	8(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm5
+	vbroadcastsd	16(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm6
+	vbroadcastsd	24(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm7
+	vbroadcastsd	0(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm8
+	vbroadcastsd	8(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm9
+	vbroadcastsd	16(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm10
+	vbroadcastsd	24(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm11
+
+	vbroadcastsd	8(%r12), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vbroadcastsd	32(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm4
+	vbroadcastsd	40(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm5
+	vbroadcastsd	48(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm6
+	vbroadcastsd	56(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm7
+	vbroadcastsd	32(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm8
+	vbroadcastsd	40(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm9
+	vbroadcastsd	48(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm10
+	vbroadcastsd	56(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm11
+
+	vbroadcastsd	16(%r12), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vbroadcastsd	64(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm4
+	vbroadcastsd	72(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm5
+	vbroadcastsd	80(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm6
+	vbroadcastsd	88(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm7
+	vbroadcastsd	64(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm8
+	vbroadcastsd	72(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm9
+	vbroadcastsd	80(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm10
+	vbroadcastsd	88(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm11
+
+	vbroadcastsd	24(%r12), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vbroadcastsd	96(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm4
+	vbroadcastsd	104(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm5
+	vbroadcastsd	112(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm6
+	vbroadcastsd	120(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm7
+	vbroadcastsd	96(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm8
+	vbroadcastsd	104(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm9
+	vbroadcastsd	112(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm10
+	vbroadcastsd	120(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm11
+
+	addq	$128, %r10
+
+	vbroadcastsd	32(%r12), %ymm13
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vbroadcastsd	8(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vbroadcastsd	16(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vbroadcastsd	24(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vbroadcastsd	0(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm8
+	vbroadcastsd	8(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm9
+	vbroadcastsd	16(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm10
+	vbroadcastsd	24(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm11
+
+	vbroadcastsd	40(%r12), %ymm13
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vbroadcastsd	48(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vbroadcastsd	56(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vbroadcastsd	32(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm8
+	vbroadcastsd	40(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm9
+	vbroadcastsd	48(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm10
+	vbroadcastsd	56(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm11
+
+	vbroadcastsd	48(%r12), %ymm13
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vbroadcastsd	88(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+	vbroadcastsd	64(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm8
+	vbroadcastsd	72(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm9
+	vbroadcastsd	80(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm10
+	vbroadcastsd	88(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm11
+
+	vbroadcastsd	56(%r12), %ymm13
+	vmulpd			%ymm7, %ymm13, %ymm7
+	vbroadcastsd	96(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm8
+	vbroadcastsd	104(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm9
+	vbroadcastsd	112(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm10
+	vbroadcastsd	120(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm11
+
+	addq	$128, %r10
+
+	vbroadcastsd	64(%r12), %ymm13
+	vmulpd			%ymm8, %ymm13, %ymm8
+	cmpl			$10, %r13d
+	jl				0f // ret
+	vbroadcastsd	8(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm9
+	vbroadcastsd	16(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm10
+	vbroadcastsd	24(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm11
+
+	vbroadcastsd	72(%r12), %ymm13
+	vmulpd			%ymm9, %ymm13, %ymm9
+	cmpl			$11, %r13d
+	jl				0f // ret
+	vbroadcastsd	48(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm9, %ymm13, %ymm10
+	vbroadcastsd	56(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm9, %ymm13, %ymm11
+
+	vbroadcastsd	80(%r12), %ymm13
+	vmulpd			%ymm10, %ymm13, %ymm10
+	cmpl			$12, %r13d
+	jl				0f // ret
+	vbroadcastsd	88(%r10, %r11, 2), %ymm13
+	vfnmadd231pd	%ymm10, %ymm13, %ymm11
+
+	vbroadcastsd	88(%r12), %ymm13
+	vmulpd			%ymm11, %ymm13, %ymm11
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_4x12_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x12_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
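+//
+// unit-diagonal variant: no scaling by a reciprocal diagonal, only the
+// off-diagonal eliminations against the 4x4 unit lower-triangular E;
+// roughly (D standing for the 12x4 tile in ymm0..ymm11):
+//
+//     for(jj=1; jj<4; jj++)
+//         for(kk=0; kk<jj; kk++)
+//             for(ii=0; ii<12; ii++) D[ii][jj] -= D[ii][kk]*E[jj][kk];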
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_ONE_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_one_12x4_lib4, @function
+inner_edge_dtrsm_rlt_one_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_one_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_12x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vfnmadd231pd	%ymm8, %ymm13, %ymm9
+
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vfnmadd231pd	%ymm8, %ymm13, %ymm10
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vfnmadd231pd	%ymm9, %ymm13, %ymm10
+
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vfnmadd231pd	%ymm8, %ymm13, %ymm11
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vfnmadd231pd	%ymm9, %ymm13, %ymm11
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+	vfnmadd231pd	%ymm10, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_one_12x4_lib4, .-inner_edge_dtrsm_rlt_one_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10  <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_ONE_12X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_one_12x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_one_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_12x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl			$2, %r11d
+	jl				0f // ret
+
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vfnmadd231pd	%ymm8, %ymm13, %ymm9
+
+	cmpl			$3, %r11d
+	jl				0f // ret
+
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vfnmadd231pd	%ymm8, %ymm13, %ymm10
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vfnmadd231pd	%ymm9, %ymm13, %ymm10
+
+	cmpl			$4, %r11d
+	jl				0f // ret
+
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vfnmadd231pd	%ymm8, %ymm13, %ymm11
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vfnmadd231pd	%ymm9, %ymm13, %ymm11
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+	vfnmadd231pd	%ymm10, %ymm13, %ymm11
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_one_12x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
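+//
+// right solve against the transposed 4x4 upper-triangular E: the columns
+// are processed from the last one backwards, each scaled by the precomputed
+// reciprocal diagonal and then eliminated from the earlier columns; roughly
+// (D standing for the 12x4 tile in ymm0..ymm11):
+//
+//     for(jj=3; jj>=0; jj--) {
+//         for(ii=0; ii<12; ii++) D[ii][jj] *= inv_diag_E[jj];
+//         for(kk=0; kk<jj; kk++)
+//             for(ii=0; ii<12; ii++) D[ii][kk] -= D[ii][jj]*E[kk][jj];
+//     }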
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUT_INV_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rut_inv_12x4_lib4, @function
+inner_edge_dtrsm_rut_inv_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rut_inv_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_12x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vmulpd			%ymm7, %ymm12, %ymm7
+	vmulpd			%ymm11, %ymm12, %ymm11
+	vbroadcastsd	112(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm2
+	vfnmadd231pd	%ymm7, %ymm12, %ymm6
+	vfnmadd231pd	%ymm11, %ymm12, %ymm10
+	vbroadcastsd	104(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm1
+	vfnmadd231pd	%ymm7, %ymm12, %ymm5
+	vfnmadd231pd	%ymm11, %ymm12, %ymm9
+	vbroadcastsd	96(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm0
+	vfnmadd231pd	%ymm7, %ymm12, %ymm4
+	vfnmadd231pd	%ymm11, %ymm12, %ymm8
+
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vmulpd			%ymm6, %ymm12, %ymm6
+	vmulpd			%ymm10, %ymm12, %ymm10
+	vbroadcastsd	72(%r10), %ymm12
+	vfnmadd231pd	%ymm2, %ymm12, %ymm1
+	vfnmadd231pd	%ymm6, %ymm12, %ymm5
+	vfnmadd231pd	%ymm10, %ymm12, %ymm9
+	vbroadcastsd	64(%r10), %ymm12
+	vfnmadd231pd	%ymm2, %ymm12, %ymm0
+	vfnmadd231pd	%ymm6, %ymm12, %ymm4
+	vfnmadd231pd	%ymm10, %ymm12, %ymm8
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vmulpd			%ymm5, %ymm12, %ymm5
+	vmulpd			%ymm9, %ymm12, %ymm9
+	vbroadcastsd	32(%r10), %ymm12
+	vfnmadd231pd	%ymm1, %ymm12, %ymm0
+	vfnmadd231pd	%ymm5, %ymm12, %ymm4
+	vfnmadd231pd	%ymm9, %ymm12, %ymm8
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+	vmulpd			%ymm4, %ymm12, %ymm4
+	vmulpd			%ymm8, %ymm12, %ymm8
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rut_inv_12x4_lib4, .-inner_edge_dtrsm_rut_inv_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
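+// Reference sketch (C-style, comments only): the code below overwrites the 12x4
+// block X held in ymm0..ymm11 with X * E^{-1}, where E is the 4x4 upper
+// triangular factor at r10 and inv_diag_E at r11 holds the reciprocals of its
+// diagonal:
+//
+//     for(j=0; j<4; j++) {
+//         for(k=0; k<j; k++)
+//             for(i=0; i<12; i++)
+//                 X[i][j] -= X[i][k] * E[k][j];
+//         for(i=0; i<12; i++)
+//             X[i][j] *= inv_diag_E[j];
+//     }
+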
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUN_INV_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_run_inv_12x4_lib4, @function
+inner_edge_dtrsm_run_inv_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_run_inv_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_12x4_lib4:
+#endif
+#endif
+
+	// first column
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+	vmulpd			%ymm4, %ymm12, %ymm4
+	vmulpd			%ymm8, %ymm12, %ymm8
+
+	// second column
+	vbroadcastsd	32(%r10), %ymm12
+	vfnmadd231pd	%ymm0, %ymm12, %ymm1
+	vfnmadd231pd	%ymm4, %ymm12, %ymm5
+	vfnmadd231pd	%ymm8, %ymm12, %ymm9
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vmulpd			%ymm5, %ymm12, %ymm5
+	vmulpd			%ymm9, %ymm12, %ymm9
+
+	// third column
+	vbroadcastsd	64(%r10), %ymm12
+	vfnmadd231pd	%ymm0, %ymm12, %ymm2
+	vfnmadd231pd	%ymm4, %ymm12, %ymm6
+	vfnmadd231pd	%ymm8, %ymm12, %ymm10
+	vbroadcastsd	72(%r10), %ymm12
+	vfnmadd231pd	%ymm1, %ymm12, %ymm2
+	vfnmadd231pd	%ymm5, %ymm12, %ymm6
+	vfnmadd231pd	%ymm9, %ymm12, %ymm10
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vmulpd			%ymm6, %ymm12, %ymm6
+	vmulpd			%ymm10, %ymm12, %ymm10
+
+	// fourth column
+	vbroadcastsd	96(%r10), %ymm12
+	vfnmadd231pd	%ymm0, %ymm12, %ymm3
+	vfnmadd231pd	%ymm4, %ymm12, %ymm7
+	vfnmadd231pd	%ymm8, %ymm12, %ymm11
+	vbroadcastsd	104(%r10), %ymm12
+	vfnmadd231pd	%ymm1, %ymm12, %ymm3
+	vfnmadd231pd	%ymm5, %ymm12, %ymm7
+	vfnmadd231pd	%ymm9, %ymm12, %ymm11
+	vbroadcastsd	112(%r10), %ymm12
+	vfnmadd231pd	%ymm2, %ymm12, %ymm3
+	vfnmadd231pd	%ymm6, %ymm12, %ymm7
+	vfnmadd231pd	%ymm10, %ymm12, %ymm11
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vmulpd			%ymm7, %ymm12, %ymm7
+	vmulpd			%ymm11, %ymm12, %ymm11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_run_inv_12x4_lib4, .-inner_edge_dtrsm_run_inv_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
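+// Reference sketch (C-style, comments only): same backward substitution as the
+// non-VS variant above, X <- X * E^{-T} with reciprocal diagonal, except that
+// kn (r12d) skips the steps for columns that are not present; E and inv_diag_E
+// here denote the 4x4 factor at r10 and the reciprocals at r11 (named D and
+// inv_diag_D in the register comments above):
+//
+//     for(j=kn-1; j>=0; j--) {
+//         for(i=0; i<12; i++)
+//             X[i][j] *= inv_diag_E[j];
+//         for(k=0; k<j; k++)
+//             for(i=0; i<12; i++)
+//                 X[i][k] -= X[i][j] * E[k][j];
+//     }
+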
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUT_INV_12X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rut_inv_12x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rut_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_12x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl			$3, %r12d
+	jle				0f
+
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vmulpd			%ymm7, %ymm12, %ymm7
+	vmulpd			%ymm11, %ymm12, %ymm11
+	vbroadcastsd	112(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm2
+	vfnmadd231pd	%ymm7, %ymm12, %ymm6
+	vfnmadd231pd	%ymm11, %ymm12, %ymm10
+	vbroadcastsd	104(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm1
+	vfnmadd231pd	%ymm7, %ymm12, %ymm5
+	vfnmadd231pd	%ymm11, %ymm12, %ymm9
+	vbroadcastsd	96(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm0
+	vfnmadd231pd	%ymm7, %ymm12, %ymm4
+	vfnmadd231pd	%ymm11, %ymm12, %ymm8
+
+0:
+	cmpl			$2, %r12d
+	jle				1f
+
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vmulpd			%ymm6, %ymm12, %ymm6
+	vmulpd			%ymm10, %ymm12, %ymm10
+	vbroadcastsd	72(%r10), %ymm12
+	vfnmadd231pd	%ymm2, %ymm12, %ymm1
+	vfnmadd231pd	%ymm6, %ymm12, %ymm5
+	vfnmadd231pd	%ymm10, %ymm12, %ymm9
+	vbroadcastsd	64(%r10), %ymm12
+	vfnmadd231pd	%ymm2, %ymm12, %ymm0
+	vfnmadd231pd	%ymm6, %ymm12, %ymm4
+	vfnmadd231pd	%ymm10, %ymm12, %ymm8
+
+1:
+	cmpl			$1, %r12d
+	jle				2f
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vmulpd			%ymm5, %ymm12, %ymm5
+	vmulpd			%ymm9, %ymm12, %ymm9
+	vbroadcastsd	32(%r10), %ymm12
+	vfnmadd231pd	%ymm1, %ymm12, %ymm0
+	vfnmadd231pd	%ymm5, %ymm12, %ymm4
+	vfnmadd231pd	%ymm9, %ymm12, %ymm8
+
+2:
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+	vmulpd			%ymm4, %ymm12, %ymm4
+	vmulpd			%ymm8, %ymm12, %ymm8
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rut_inv_12x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10   <- E0
+// r11   <- 4*sde*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- E0
+// r11   <- 4*sde*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
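+// Reference sketch (C-style, comments only): the code below overwrites the 12x4
+// block X held in ymm0..ymm11 with E^{-1} * X, where E is a 12x12 unit lower
+// triangular factor stored in three 4-row panels E0, E1 and E2, each the
+// previous one plus 4*sde*sizeof(double) bytes:
+//
+//     for(k=0; k<12; k++)
+//         for(j=0; j<4; j++)
+//             for(i=k+1; i<12; i++)
+//                 X[i][j] -= E[i][k] * X[k][j];  // unit diagonal: no division
+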
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LLN_ONE_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lln_one_12x4_lib4, @function
+inner_edge_dtrsm_lln_one_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lln_one_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_12x4_lib4:
+#endif
+#endif
+
+	movq	%r10, %r12 // E1 <- E0
+	addq	%r11, %r12 // E1 <- E0 + 4*sde*sizeof(double)
+	movq	%r12, %r13 // E2 <- E1
+	addq	%r11, %r13 // E2 <- E1 + 4*sde*sizeof(double)
+
+	// left block-column
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			0(%r10), %ymm12
+	vblendpd		$0x1, %ymm15, %ymm12, %ymm12
+	vmovapd			0(%r12), %ymm14
+	vmovapd			0(%r13), %ymm15
+	vpermpd			$0x00, %ymm0, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm0
+	vfnmadd231pd	%ymm14, %ymm13, %ymm4
+	vfnmadd231pd	%ymm15, %ymm13, %ymm8
+	vpermpd			$0x00, %ymm1, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm1
+	vfnmadd231pd	%ymm14, %ymm13, %ymm5
+	vfnmadd231pd	%ymm15, %ymm13, %ymm9
+	vpermpd			$0x00, %ymm2, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm2
+	vfnmadd231pd	%ymm14, %ymm13, %ymm6
+	vfnmadd231pd	%ymm15, %ymm13, %ymm10
+	vpermpd			$0x00, %ymm3, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm3
+	vfnmadd231pd	%ymm14, %ymm13, %ymm7
+	vfnmadd231pd	%ymm15, %ymm13, %ymm11
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			32(%r10), %ymm12
+	vblendpd		$0x3, %ymm15, %ymm12, %ymm12
+	vmovapd			32(%r12), %ymm14
+	vmovapd			32(%r13), %ymm15
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm0
+	vfnmadd231pd	%ymm14, %ymm13, %ymm4
+	vfnmadd231pd	%ymm15, %ymm13, %ymm8
+	vpermpd			$0x55, %ymm1, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm1
+	vfnmadd231pd	%ymm14, %ymm13, %ymm5
+	vfnmadd231pd	%ymm15, %ymm13, %ymm9
+	vpermpd			$0x55, %ymm2, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm2
+	vfnmadd231pd	%ymm14, %ymm13, %ymm6
+	vfnmadd231pd	%ymm15, %ymm13, %ymm10
+	vpermpd			$0x55, %ymm3, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm3
+	vfnmadd231pd	%ymm14, %ymm13, %ymm7
+	vfnmadd231pd	%ymm15, %ymm13, %ymm11
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			64(%r10), %ymm12
+	vblendpd		$0x7, %ymm15, %ymm12, %ymm12
+	vmovapd			64(%r12), %ymm14
+	vmovapd			64(%r13), %ymm15
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm0
+	vfnmadd231pd	%ymm14, %ymm13, %ymm4
+	vfnmadd231pd	%ymm15, %ymm13, %ymm8
+	vpermpd			$0xaa, %ymm1, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm1
+	vfnmadd231pd	%ymm14, %ymm13, %ymm5
+	vfnmadd231pd	%ymm15, %ymm13, %ymm9
+	vpermpd			$0xaa, %ymm2, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm2
+	vfnmadd231pd	%ymm14, %ymm13, %ymm6
+	vfnmadd231pd	%ymm15, %ymm13, %ymm10
+	vpermpd			$0xaa, %ymm3, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm3
+	vfnmadd231pd	%ymm14, %ymm13, %ymm7
+	vfnmadd231pd	%ymm15, %ymm13, %ymm11
+
+	vmovapd			96(%r12), %ymm14
+	vmovapd			96(%r13), %ymm15
+	vpermpd			$0xff, %ymm0, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm4
+	vfnmadd231pd	%ymm15, %ymm13, %ymm8
+	vpermpd			$0xff, %ymm1, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm5
+	vfnmadd231pd	%ymm15, %ymm13, %ymm9
+	vpermpd			$0xff, %ymm2, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm6
+	vfnmadd231pd	%ymm15, %ymm13, %ymm10
+	vpermpd			$0xff, %ymm3, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm7
+	vfnmadd231pd	%ymm15, %ymm13, %ymm11
+
+	addq		$128, %r12
+	addq		$128, %r13
+
+
+	// middle block-column
+
+	vxorpd			%ymm14, %ymm14, %ymm14
+	vmovapd			0(%r12), %ymm12
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd			0(%r13), %ymm14
+	vpermpd			$0x00, %ymm4, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm4
+	vfnmadd231pd	%ymm14, %ymm13, %ymm8
+	vpermpd			$0x00, %ymm5, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm5
+	vfnmadd231pd	%ymm14, %ymm13, %ymm9
+	vpermpd			$0x00, %ymm6, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm6
+	vfnmadd231pd	%ymm14, %ymm13, %ymm10
+	vpermpd			$0x00, %ymm7, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm7
+	vfnmadd231pd	%ymm14, %ymm13, %ymm11
+
+	vxorpd			%ymm14, %ymm14, %ymm14
+	vmovapd			32(%r12), %ymm12
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd			32(%r13), %ymm14
+	vpermpd			$0x55, %ymm4, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm4
+	vfnmadd231pd	%ymm14, %ymm13, %ymm8
+	vpermpd			$0x55, %ymm5, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm5
+	vfnmadd231pd	%ymm14, %ymm13, %ymm9
+	vpermpd			$0x55, %ymm6, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm6
+	vfnmadd231pd	%ymm14, %ymm13, %ymm10
+	vpermpd			$0x55, %ymm7, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm7
+	vfnmadd231pd	%ymm14, %ymm13, %ymm11
+
+	vxorpd			%ymm14, %ymm14, %ymm14
+	vmovapd			64(%r12), %ymm12
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd			64(%r13), %ymm14
+	vpermpd			$0xaa, %ymm4, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm4
+	vfnmadd231pd	%ymm14, %ymm13, %ymm8
+	vpermpd			$0xaa, %ymm5, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm5
+	vfnmadd231pd	%ymm14, %ymm13, %ymm9
+	vpermpd			$0xaa, %ymm6, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm6
+	vfnmadd231pd	%ymm14, %ymm13, %ymm10
+	vpermpd			$0xaa, %ymm7, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm7
+	vfnmadd231pd	%ymm14, %ymm13, %ymm11
+
+	vmovapd			96(%r13), %ymm14
+	vpermpd			$0xff, %ymm4, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm8
+	vpermpd			$0xff, %ymm5, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm9
+	vpermpd			$0xff, %ymm6, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm10
+	vpermpd			$0xff, %ymm7, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm11
+
+
+	addq		$128, %r13
+
+
+	// right block-column
+
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vmovapd			0(%r13), %ymm12
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm12
+	vpermpd			$0x00, %ymm8, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm8
+	vpermpd			$0x00, %ymm9, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm9
+	vpermpd			$0x00, %ymm10, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm10
+	vpermpd			$0x00, %ymm11, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm11
+
+	vmovapd			32(%r13), %ymm12
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm12
+	vpermpd			$0x55, %ymm8, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm8
+	vpermpd			$0x55, %ymm9, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm9
+	vpermpd			$0x55, %ymm10, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm10
+	vpermpd			$0x55, %ymm11, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm11
+
+	vmovapd			64(%r13), %ymm12
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm12
+	vpermpd			$0xaa, %ymm8, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm8
+	vpermpd			$0xaa, %ymm9, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm9
+	vpermpd			$0xaa, %ymm10, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm10
+	vpermpd			$0xaa, %ymm11, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm11
+
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lln_one_12x4_lib4, .-inner_edge_dtrsm_lln_one_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
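+// Reference sketch (C-style, comments only): the code below overwrites the 12x4
+// block X held in ymm0..ymm11 with E^{-1} * X, where E is a 12x12 upper
+// triangular factor stored in three 4-row panels and inv_diag_E holds the
+// reciprocals of its diagonal (backward substitution, starting from the
+// bottom-right corner as in the code):
+//
+//     for(k=11; k>=0; k--)
+//         for(j=0; j<4; j++) {
+//             X[k][j] *= inv_diag_E[k];
+//             for(i=0; i<k; i++)
+//                 X[i][j] -= E[i][k] * X[k][j];
+//         }
+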
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LUN_INV_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lun_inv_12x4_lib4, @function
+inner_edge_dtrsm_lun_inv_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lun_inv_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_12x4_lib4:
+#endif
+#endif
+	
+	movq	%r10, %r13 // E1 <- E0
+	addq	%r11, %r13 // E1 <- E0 + 4*sde*sizeof(double)
+	movq	%r13, %r14 // E2 <- E1
+	addq	%r11, %r14 // E2 <- E1 + 4*sde*sizeof(double)
+
+	// bottom-right
+
+	vmovapd			352(%r14), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	88(%r12), %ymm12
+	vmovapd			352(%r13), %ymm15
+//	vmovapd			352(%r10), %ymm11
+
+	vpermpd			$0xff, %ymm8, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm8, %ymm8
+	vfnmadd231pd	%ymm13, %ymm14, %ymm8
+	vfnmadd231pd	%ymm15, %ymm14, %ymm4
+	vfnmadd231pd	352(%r10), %ymm14, %ymm0
+
+	vpermpd			$0xff, %ymm9, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm9, %ymm9
+	vfnmadd231pd	%ymm13, %ymm14, %ymm9
+	vfnmadd231pd	%ymm15, %ymm14, %ymm5
+	vfnmadd231pd	352(%r10), %ymm14, %ymm1
+
+	vpermpd			$0xff, %ymm10, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm10, %ymm10
+	vfnmadd231pd	%ymm13, %ymm14, %ymm10
+	vfnmadd231pd	%ymm15, %ymm14, %ymm6
+	vfnmadd231pd	352(%r10), %ymm14, %ymm2
+
+	vpermpd			$0xff, %ymm11, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm11, %ymm11
+	vfnmadd231pd	%ymm13, %ymm14, %ymm11
+	vfnmadd231pd	%ymm15, %ymm14, %ymm7
+	vfnmadd231pd	352(%r10), %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0 (the VEX 128-bit load below also zeroes the upper lanes)
+	vmovapd			320(%r14), %xmm13
+	vbroadcastsd	80(%r12), %ymm12
+	vmovapd			320(%r13), %ymm15
+//	vmovapd			320(%r10), %ymm11
+
+	vpermpd			$0xaa, %ymm8, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm8, %ymm8
+	vfnmadd231pd	%ymm13, %ymm14, %ymm8
+	vfnmadd231pd	%ymm15, %ymm14, %ymm4
+	vfnmadd231pd	320(%r10), %ymm14, %ymm0
+
+	vpermpd			$0xaa, %ymm9, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm9, %ymm9
+	vfnmadd231pd	%ymm13, %ymm14, %ymm9
+	vfnmadd231pd	%ymm15, %ymm14, %ymm5
+	vfnmadd231pd	320(%r10), %ymm14, %ymm1
+
+	vpermpd			$0xaa, %ymm10, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm10, %ymm10
+	vfnmadd231pd	%ymm13, %ymm14, %ymm10
+	vfnmadd231pd	%ymm15, %ymm14, %ymm6
+	vfnmadd231pd	320(%r10), %ymm14, %ymm2
+
+	vpermpd			$0xaa, %ymm11, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm11, %ymm11
+	vfnmadd231pd	%ymm13, %ymm14, %ymm11
+	vfnmadd231pd	%ymm15, %ymm14, %ymm7
+	vfnmadd231pd	320(%r10), %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			288(%r14), %xmm13
+	vbroadcastsd	72(%r12), %ymm12
+	vmovapd			288(%r13), %ymm15
+//	vmovapd			288(%r10), %ymm11
+
+	vpermpd			$0x55, %ymm8, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm8, %ymm8
+	vfnmadd231pd	%ymm13, %ymm14, %ymm8
+	vfnmadd231pd	%ymm15, %ymm14, %ymm4
+	vfnmadd231pd	288(%r10), %ymm14, %ymm0
+
+	vpermpd			$0x55, %ymm9, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm9, %ymm9
+	vfnmadd231pd	%ymm13, %ymm14, %ymm9
+	vfnmadd231pd	%ymm15, %ymm14, %ymm5
+	vfnmadd231pd	288(%r10), %ymm14, %ymm1
+
+	vpermpd			$0x55, %ymm10, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm10, %ymm10
+	vfnmadd231pd	%ymm13, %ymm14, %ymm10
+	vfnmadd231pd	%ymm15, %ymm14, %ymm6
+	vfnmadd231pd	288(%r10), %ymm14, %ymm2
+
+	vpermpd			$0x55, %ymm11, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm11, %ymm11
+	vfnmadd231pd	%ymm13, %ymm14, %ymm11
+	vfnmadd231pd	%ymm15, %ymm14, %ymm7
+	vfnmadd231pd	288(%r10), %ymm14, %ymm3
+
+
+	vbroadcastsd	64(%r12), %ymm12
+	vmovapd			256(%r13), %ymm15
+//	vmovapd			256(%r10), %ymm11
+
+	vpermpd			$0x00, %ymm8, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm8, %ymm8
+	vfnmadd231pd	%ymm15, %ymm14, %ymm4
+	vfnmadd231pd	256(%r10), %ymm14, %ymm0
+
+	vpermpd			$0x00, %ymm9, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm9, %ymm9
+	vfnmadd231pd	%ymm15, %ymm14, %ymm5
+	vfnmadd231pd	256(%r10), %ymm14, %ymm1
+
+	vpermpd			$0x00, %ymm10, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm10, %ymm10
+	vfnmadd231pd	%ymm15, %ymm14, %ymm6
+	vfnmadd231pd	256(%r10), %ymm14, %ymm2
+
+	vpermpd			$0x00, %ymm11, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm11, %ymm11
+	vfnmadd231pd	%ymm15, %ymm14, %ymm7
+	vfnmadd231pd	256(%r10), %ymm14, %ymm3
+
+
+	// middle-middle
+
+	vmovapd			224(%r13), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	56(%r12), %ymm12
+	vmovapd			224(%r10), %ymm15
+
+	vpermpd			$0xff, %ymm4, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm4, %ymm4
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vfnmadd231pd	%ymm15, %ymm14, %ymm0
+
+	vpermpd			$0xff, %ymm5, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm5, %ymm5
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vfnmadd231pd	%ymm15, %ymm14, %ymm1
+
+	vpermpd			$0xff, %ymm6, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm6, %ymm6
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vfnmadd231pd	%ymm15, %ymm14, %ymm2
+
+	vpermpd			$0xff, %ymm7, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm7, %ymm7
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vfnmadd231pd	%ymm15, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0 (the VEX 128-bit load below also zeroes the upper lanes)
+	vmovapd			192(%r13), %xmm13
+	vbroadcastsd	48(%r12), %ymm12
+	vmovapd			192(%r10), %ymm15
+
+	vpermpd			$0xaa, %ymm4, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm4, %ymm4
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vfnmadd231pd	%ymm15, %ymm14, %ymm0
+
+	vpermpd			$0xaa, %ymm5, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm5, %ymm5
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vfnmadd231pd	%ymm15, %ymm14, %ymm1
+
+	vpermpd			$0xaa, %ymm6, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm6, %ymm6
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vfnmadd231pd	%ymm15, %ymm14, %ymm2
+
+	vpermpd			$0xaa, %ymm7, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm7, %ymm7
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vfnmadd231pd	%ymm15, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			160(%r13), %xmm13
+	vbroadcastsd	40(%r12), %ymm12
+	vmovapd			160(%r10), %ymm15
+
+	vpermpd			$0x55, %ymm4, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm4, %ymm4
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vfnmadd231pd	%ymm15, %ymm14, %ymm0
+
+	vpermpd			$0x55, %ymm5, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm5, %ymm5
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vfnmadd231pd	%ymm15, %ymm14, %ymm1
+
+	vpermpd			$0x55, %ymm6, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm6, %ymm6
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vfnmadd231pd	%ymm15, %ymm14, %ymm2
+
+	vpermpd			$0x55, %ymm7, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm7, %ymm7
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vfnmadd231pd	%ymm15, %ymm14, %ymm3
+
+
+	vbroadcastsd	32(%r12), %ymm12
+	vmovapd			128(%r10), %ymm15
+
+	vpermpd			$0x00, %ymm4, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm4, %ymm4
+	vfnmadd231pd	%ymm15, %ymm14, %ymm0
+
+	vpermpd			$0x00, %ymm5, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm5, %ymm5
+	vfnmadd231pd	%ymm15, %ymm14, %ymm1
+
+	vpermpd			$0x00, %ymm6, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm6, %ymm6
+	vfnmadd231pd	%ymm15, %ymm14, %ymm2
+
+	vpermpd			$0x00, %ymm7, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm7, %ymm7
+	vfnmadd231pd	%ymm15, %ymm14, %ymm3
+
+
+	// top-left
+
+	vmovapd			96(%r10), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	24(%r12), %ymm12
+
+	vpermpd			$0xff, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vpermpd			$0xff, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vpermpd			$0xff, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vpermpd			$0xff, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovapd			64(%r10), %xmm13
+	vbroadcastsd	16(%r12), %ymm12
+
+	vpermpd			$0xaa, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vpermpd			$0xaa, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vpermpd			$0xaa, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vpermpd			$0xaa, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			32(%r10), %xmm13
+	vbroadcastsd	8(%r12), %ymm12
+
+	vpermilpd		$0xf, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vpermilpd		$0xf, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vpermilpd		$0xf, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vpermilpd		$0xf, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+
+	vbroadcastsd	0(%r12), %ymm12
+
+	vmulpd			%ymm0, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm0, %ymm0
+
+	vmulpd			%ymm1, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm1, %ymm1
+
+	vmulpd			%ymm2, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm2, %ymm2
+
+	vmulpd			%ymm3, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lun_inv_12x4_lib4, .-inner_edge_dtrsm_lun_inv_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// r13  <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// r13  <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
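+// Reference sketch (C-style, comments only): same backward substitution as the
+// non-VS variant above, except that km (r13d) skips the solve steps for
+// trailing rows that are not present (only rows 9..11 are guarded):
+//
+//     for(k=11; k>=0; k--) {
+//         if(k>8 && km<=k) continue;  // row k not present, skip its step
+//         for(j=0; j<4; j++) {
+//             X[k][j] *= inv_diag_E[k];
+//             for(i=0; i<k; i++)
+//                 X[i][j] -= E[i][k] * X[k][j];
+//         }
+//     }
+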
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LUN_INV_12X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lun_inv_12x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lun_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_12x4_vs_lib4:
+#endif
+#endif
+	
+	movq	%r10, %r15 // E1 <- E0
+	addq	%r11, %r15 // E1 <- E0 + 4*sde*sizeof(double)
+	movq	%r15, %r14 // E2 <- E1
+	addq	%r11, %r14 // E2 <- E1 + 4*sde*sizeof(double)
+
+	// bottom-right
+
+	cmpl	$11, %r13d
+	jle		0f
+
+	vmovapd			352(%r14), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	88(%r12), %ymm12
+	vmovapd			352(%r15), %ymm15
+//	vmovapd			352(%r10), %ymm11
+
+	vpermpd			$0xff, %ymm8, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm8, %ymm8
+	vfnmadd231pd	%ymm13, %ymm14, %ymm8
+	vfnmadd231pd	%ymm15, %ymm14, %ymm4
+	vfnmadd231pd	352(%r10), %ymm14, %ymm0
+
+	vpermpd			$0xff, %ymm9, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm9, %ymm9
+	vfnmadd231pd	%ymm13, %ymm14, %ymm9
+	vfnmadd231pd	%ymm15, %ymm14, %ymm5
+	vfnmadd231pd	352(%r10), %ymm14, %ymm1
+
+	vpermpd			$0xff, %ymm10, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm10, %ymm10
+	vfnmadd231pd	%ymm13, %ymm14, %ymm10
+	vfnmadd231pd	%ymm15, %ymm14, %ymm6
+	vfnmadd231pd	352(%r10), %ymm14, %ymm2
+
+	vpermpd			$0xff, %ymm11, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm11, %ymm11
+	vfnmadd231pd	%ymm13, %ymm14, %ymm11
+	vfnmadd231pd	%ymm15, %ymm14, %ymm7
+	vfnmadd231pd	352(%r10), %ymm14, %ymm3
+
+0:
+	cmpl	$10, %r13d
+	jle		1f
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0 (the VEX 128-bit load below also zeroes the upper lanes)
+	vmovapd			320(%r14), %xmm13
+	vbroadcastsd	80(%r12), %ymm12
+	vmovapd			320(%r15), %ymm15
+//	vmovapd			320(%r10), %ymm11
+
+	vpermpd			$0xaa, %ymm8, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm8, %ymm8
+	vfnmadd231pd	%ymm13, %ymm14, %ymm8
+	vfnmadd231pd	%ymm15, %ymm14, %ymm4
+	vfnmadd231pd	320(%r10), %ymm14, %ymm0
+
+	vpermpd			$0xaa, %ymm9, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm9, %ymm9
+	vfnmadd231pd	%ymm13, %ymm14, %ymm9
+	vfnmadd231pd	%ymm15, %ymm14, %ymm5
+	vfnmadd231pd	320(%r10), %ymm14, %ymm1
+
+	vpermpd			$0xaa, %ymm10, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm10, %ymm10
+	vfnmadd231pd	%ymm13, %ymm14, %ymm10
+	vfnmadd231pd	%ymm15, %ymm14, %ymm6
+	vfnmadd231pd	320(%r10), %ymm14, %ymm2
+
+	vpermpd			$0xaa, %ymm11, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm11, %ymm11
+	vfnmadd231pd	%ymm13, %ymm14, %ymm11
+	vfnmadd231pd	%ymm15, %ymm14, %ymm7
+	vfnmadd231pd	320(%r10), %ymm14, %ymm3
+
+1:
+	cmpl	$9, %r13d
+	jle		2f
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			288(%r14), %xmm13
+	vbroadcastsd	72(%r12), %ymm12
+	vmovapd			288(%r15), %ymm15
+//	vmovapd			288(%r10), %ymm11
+
+	vpermpd			$0x55, %ymm8, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm8, %ymm8
+	vfnmadd231pd	%ymm13, %ymm14, %ymm8
+	vfnmadd231pd	%ymm15, %ymm14, %ymm4
+	vfnmadd231pd	288(%r10), %ymm14, %ymm0
+
+	vpermpd			$0x55, %ymm9, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm9, %ymm9
+	vfnmadd231pd	%ymm13, %ymm14, %ymm9
+	vfnmadd231pd	%ymm15, %ymm14, %ymm5
+	vfnmadd231pd	288(%r10), %ymm14, %ymm1
+
+	vpermpd			$0x55, %ymm10, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm10, %ymm10
+	vfnmadd231pd	%ymm13, %ymm14, %ymm10
+	vfnmadd231pd	%ymm15, %ymm14, %ymm6
+	vfnmadd231pd	288(%r10), %ymm14, %ymm2
+
+	vpermpd			$0x55, %ymm11, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm11, %ymm11
+	vfnmadd231pd	%ymm13, %ymm14, %ymm11
+	vfnmadd231pd	%ymm15, %ymm14, %ymm7
+	vfnmadd231pd	288(%r10), %ymm14, %ymm3
+
+2:
+
+	vbroadcastsd	64(%r12), %ymm12
+	vmovapd			256(%r15), %ymm15
+//	vmovapd			256(%r10), %ymm11
+
+	vpermpd			$0x00, %ymm8, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm8, %ymm8
+	vfnmadd231pd	%ymm15, %ymm14, %ymm4
+	vfnmadd231pd	256(%r10), %ymm14, %ymm0
+
+	vpermpd			$0x00, %ymm9, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm9, %ymm9
+	vfnmadd231pd	%ymm15, %ymm14, %ymm5
+	vfnmadd231pd	256(%r10), %ymm14, %ymm1
+
+	vpermpd			$0x00, %ymm10, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm10, %ymm10
+	vfnmadd231pd	%ymm15, %ymm14, %ymm6
+	vfnmadd231pd	256(%r10), %ymm14, %ymm2
+
+	vpermpd			$0x00, %ymm11, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm11, %ymm11
+	vfnmadd231pd	%ymm15, %ymm14, %ymm7
+	vfnmadd231pd	256(%r10), %ymm14, %ymm3
+
+
+	// middle-middle
+
+	vmovapd			224(%r15), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	56(%r12), %ymm12
+	vmovapd			224(%r10), %ymm15
+
+	vpermpd			$0xff, %ymm4, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm4, %ymm4
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vfnmadd231pd	%ymm15, %ymm14, %ymm0
+
+	vpermpd			$0xff, %ymm5, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm5, %ymm5
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vfnmadd231pd	%ymm15, %ymm14, %ymm1
+
+	vpermpd			$0xff, %ymm6, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm6, %ymm6
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vfnmadd231pd	%ymm15, %ymm14, %ymm2
+
+	vpermpd			$0xff, %ymm7, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm7, %ymm7
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vfnmadd231pd	%ymm15, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0 (the VEX 128-bit load below also zeroes the upper lanes)
+	vmovapd			192(%r15), %xmm13
+	vbroadcastsd	48(%r12), %ymm12
+	vmovapd			192(%r10), %ymm15
+
+	vpermpd			$0xaa, %ymm4, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm4, %ymm4
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vfnmadd231pd	%ymm15, %ymm14, %ymm0
+
+	vpermpd			$0xaa, %ymm5, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm5, %ymm5
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vfnmadd231pd	%ymm15, %ymm14, %ymm1
+
+	vpermpd			$0xaa, %ymm6, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm6, %ymm6
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vfnmadd231pd	%ymm15, %ymm14, %ymm2
+
+	vpermpd			$0xaa, %ymm7, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm7, %ymm7
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vfnmadd231pd	%ymm15, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			160(%r15), %xmm13
+	vbroadcastsd	40(%r12), %ymm12
+	vmovapd			160(%r10), %ymm15
+
+	vpermpd			$0x55, %ymm4, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm4, %ymm4
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vfnmadd231pd	%ymm15, %ymm14, %ymm0
+
+	vpermpd			$0x55, %ymm5, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm5, %ymm5
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vfnmadd231pd	%ymm15, %ymm14, %ymm1
+
+	vpermpd			$0x55, %ymm6, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm6, %ymm6
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vfnmadd231pd	%ymm15, %ymm14, %ymm2
+
+	vpermpd			$0x55, %ymm7, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm7, %ymm7
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vfnmadd231pd	%ymm15, %ymm14, %ymm3
+
+
+	vbroadcastsd	32(%r12), %ymm12
+	vmovapd			128(%r10), %ymm15
+
+	vpermpd			$0x00, %ymm4, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm4, %ymm4
+	vfnmadd231pd	%ymm15, %ymm14, %ymm0
+
+	vpermpd			$0x00, %ymm5, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm5, %ymm5
+	vfnmadd231pd	%ymm15, %ymm14, %ymm1
+
+	vpermpd			$0x00, %ymm6, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm6, %ymm6
+	vfnmadd231pd	%ymm15, %ymm14, %ymm2
+
+	vpermpd			$0x00, %ymm7, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm7, %ymm7
+	vfnmadd231pd	%ymm15, %ymm14, %ymm3
+
+
+	// top-left
+
+	vmovapd			96(%r10), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	24(%r12), %ymm12
+
+	vpermpd			$0xff, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vpermpd			$0xff, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vpermpd			$0xff, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vpermpd			$0xff, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovapd			64(%r10), %xmm13
+	vbroadcastsd	16(%r12), %ymm12
+
+	vpermpd			$0xaa, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vpermpd			$0xaa, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vpermpd			$0xaa, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vpermpd			$0xaa, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			32(%r10), %xmm13
+	vbroadcastsd	8(%r12), %ymm12
+
+	vpermilpd		$0xf, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vpermilpd		$0xf, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vpermilpd		$0xf, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vpermilpd		$0xf, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+
+	vbroadcastsd	0(%r12), %ymm12
+
+	vmulpd			%ymm0, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm0, %ymm0
+
+	vmulpd			%ymm1, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm1, %ymm1
+
+	vmulpd			%ymm2, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm2, %ymm2
+
+	vmulpd			%ymm3, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lun_inv_12x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// left kernel
+//
+// input arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
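+// Reference sketch (C-style, comments only): unblocked LU factorization without
+// pivoting of the 12x4 panel X held in ymm0..ymm11, with the pivot block in
+// rows 0..3; the reciprocals of the pivots are stored in inv_diag_E so that
+// later kernels can multiply instead of divide:
+//
+//     for(j=0; j<4; j++) {
+//         for(k=0; k<j; k++)
+//             for(i=k+1; i<12; i++)
+//                 X[i][j] -= X[i][k] * X[k][j];
+//         inv_diag_E[j] = 1.0 / X[j][j];
+//         for(i=j+1; i<12; i++)
+//             X[i][j] *= inv_diag_E[j];       // L entries below the diagonal
+//     }
+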
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGETRF_L_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgetrf_l_12x4_lib4, @function
+inner_edge_dgetrf_l_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgetrf_l_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_l_12x4_lib4:
+#endif
+#endif
+	
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC05(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd			LC05(%rip), %xmm14 // 1.0
+#endif
+//	vmovddup		%xmm14, %xmm14
+
+	// first column
+//	vblendpd		$0x1, %ymm0, %ymm12, %ymm12
+	vmovapd			%ymm0, %ymm12
+	vdivsd			%xmm0, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 0(%r10)
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vmulpd			%ymm8, %ymm13, %ymm8
+	vblendpd		$0x1, %ymm12, %ymm0, %ymm0
+
+	// second column
+	vpermpd			$0x00, %ymm1, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vfnmadd231pd	%ymm8, %ymm13, %ymm9
+	vblendpd		$0x2, %ymm1, %ymm13, %ymm12
+
+	vpermilpd		$0x3, %xmm1, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 8(%r10)
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vmulpd			%ymm9, %ymm13, %ymm9
+	vblendpd		$0x3, %ymm12, %ymm1, %ymm1
+
+	// third column
+	vpermpd			$0x00, %ymm2, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vfnmadd231pd	%ymm8, %ymm13, %ymm10
+	vblendpd		$0x2, %ymm2, %ymm13, %ymm12
+
+	vpermpd			$0x55, %ymm2, %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vfnmadd231pd	%ymm9, %ymm13, %ymm10
+	vblendpd		$0x4, %ymm2, %ymm12, %ymm12
+
+	vpermpd			$0xaa, %ymm2, %ymm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 16(%r10)
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vmulpd			%ymm10, %ymm13, %ymm10
+	vblendpd		$0x7, %ymm12, %ymm2, %ymm2
+
+	// fourth column
+	vpermpd			$0x00, %ymm3, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vfnmadd231pd	%ymm8, %ymm13, %ymm11
+	vblendpd		$0x2, %ymm3, %ymm13, %ymm12
+
+	vpermpd			$0x55, %ymm3, %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vfnmadd231pd	%ymm9, %ymm13, %ymm11
+	vblendpd		$0x4, %ymm3, %ymm12, %ymm12
+
+	vpermpd			$0xaa, %ymm3, %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+	vfnmadd231pd	%ymm10, %ymm13, %ymm11
+	vblendpd		$0x8, %ymm3, %ymm12, %ymm12
+	
+	vpermpd			$0xff, %ymm3, %ymm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 24(%r10)
+//	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+	vmulpd			%ymm11, %ymm13, %ymm11
+	vblendpd		$0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgetrf_l_12x4_lib4, .-inner_edge_dgetrf_l_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// middle kernel
+//
+// input arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
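+// Reference sketch (C-style, comments only): the 12x4 panel X in ymm0..ymm11 is
+// first updated by forward substitution with the already factorized unit lower
+// triangular block E[0:4][0:4] (and the matching columns E[4:12][0:4] of L),
+// then rows 4..11 are LU-factorized without pivoting with the pivot block in
+// rows 4..7; the pivot reciprocals go to inv_diag_D:
+//
+//     // trsm/gemm part, E stored in three 4-row panels as above
+//     for(k=0; k<4; k++)
+//         for(j=0; j<4; j++)
+//             for(i=k+1; i<12; i++)
+//                 X[i][j] -= E[i][k] * X[k][j];
+//
+//     // getrf part, same scheme as the left kernel but shifted 4 rows down
+//     for(j=0; j<4; j++) {
+//         for(k=0; k<j; k++)
+//             for(i=k+1; i<8; i++)
+//                 X[4+i][j] -= X[4+i][k] * X[4+k][j];
+//         inv_diag_D[j] = 1.0 / X[4+j][j];
+//         for(i=j+1; i<8; i++)
+//             X[4+i][j] *= inv_diag_D[j];
+//     }
+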
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGETRF_M_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgetrf_m_12x4_lib4, @function
+inner_edge_dgetrf_m_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_m_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgetrf_m_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_m_12x4_lib4:
+#endif
+#endif
+	
+	movq	%r10, %r14 // E1 <- E0
+	addq	%r11, %r14 // E1 <- E0 + 4*sde*sizeof(double)
+	movq	%r14, %r13 // E2 <- E1
+	addq	%r11, %r13 // E2 <- E1 + 4*sde*sizeof(double)
+
+	// solve upper 4x4 & correct lower 8x4
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			0(%r10), %ymm12
+	vblendpd		$0x1, %ymm15, %ymm12, %ymm12
+	vmovapd			0(%r14), %ymm14
+	vmovapd			0(%r13), %ymm15
+	vpermpd			$0x00, %ymm0, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm0
+	vfnmadd231pd	%ymm14, %ymm13, %ymm4
+	vfnmadd231pd	%ymm15, %ymm13, %ymm8
+	vpermpd			$0x00, %ymm1, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm1
+	vfnmadd231pd	%ymm14, %ymm13, %ymm5
+	vfnmadd231pd	%ymm15, %ymm13, %ymm9
+	vpermpd			$0x00, %ymm2, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm2
+	vfnmadd231pd	%ymm14, %ymm13, %ymm6
+	vfnmadd231pd	%ymm15, %ymm13, %ymm10
+	vpermpd			$0x00, %ymm3, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm3
+	vfnmadd231pd	%ymm14, %ymm13, %ymm7
+	vfnmadd231pd	%ymm15, %ymm13, %ymm11
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			32(%r10), %ymm12
+	vblendpd		$0x3, %ymm15, %ymm12, %ymm12
+	vmovapd			32(%r14), %ymm14
+	vmovapd			32(%r13), %ymm15
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm0
+	vfnmadd231pd	%ymm14, %ymm13, %ymm4
+	vfnmadd231pd	%ymm15, %ymm13, %ymm8
+	vpermpd			$0x55, %ymm1, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm1
+	vfnmadd231pd	%ymm14, %ymm13, %ymm5
+	vfnmadd231pd	%ymm15, %ymm13, %ymm9
+	vpermpd			$0x55, %ymm2, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm2
+	vfnmadd231pd	%ymm14, %ymm13, %ymm6
+	vfnmadd231pd	%ymm15, %ymm13, %ymm10
+	vpermpd			$0x55, %ymm3, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm3
+	vfnmadd231pd	%ymm14, %ymm13, %ymm7
+	vfnmadd231pd	%ymm15, %ymm13, %ymm11
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			64(%r10), %ymm12
+	vblendpd		$0x7, %ymm15, %ymm12, %ymm12
+	vmovapd			64(%r14), %ymm14
+	vmovapd			64(%r13), %ymm15
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm0
+	vfnmadd231pd	%ymm14, %ymm13, %ymm4
+	vfnmadd231pd	%ymm15, %ymm13, %ymm8
+	vpermpd			$0xaa, %ymm1, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm1
+	vfnmadd231pd	%ymm14, %ymm13, %ymm5
+	vfnmadd231pd	%ymm15, %ymm13, %ymm9
+	vpermpd			$0xaa, %ymm2, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm2
+	vfnmadd231pd	%ymm14, %ymm13, %ymm6
+	vfnmadd231pd	%ymm15, %ymm13, %ymm10
+	vpermpd			$0xaa, %ymm3, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm3
+	vfnmadd231pd	%ymm14, %ymm13, %ymm7
+	vfnmadd231pd	%ymm15, %ymm13, %ymm11
+
+	vmovapd			96(%r14), %ymm14
+	vmovapd			96(%r13), %ymm15
+	vpermpd			$0xff, %ymm0, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm4
+	vfnmadd231pd	%ymm15, %ymm13, %ymm8
+	vpermpd			$0xff, %ymm1, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm5
+	vfnmadd231pd	%ymm15, %ymm13, %ymm9
+	vpermpd			$0xff, %ymm2, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm6
+	vfnmadd231pd	%ymm15, %ymm13, %ymm10
+	vpermpd			$0xff, %ymm3, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm7
+	vfnmadd231pd	%ymm15, %ymm13, %ymm11
+
+
+	// factorize lower 8x4
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC05(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd			LC05(%rip), %xmm14 // 1.0
+#endif
+//	vmovddup		%xmm14, %xmm14
+
+	// first column
+//	vblendpd		$0x1, %ymm4, %ymm12, %ymm12
+	vmovapd			%ymm4, %ymm12
+	vdivsd			%xmm4, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 0(%r12)
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vmulpd			%ymm8, %ymm13, %ymm8
+	vblendpd		$0x1, %ymm12, %ymm4, %ymm4
+
+	// second column
+	vpermpd			$0x00, %ymm5, %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vfnmadd231pd	%ymm8, %ymm13, %ymm9
+	vblendpd		$0x2, %ymm5, %ymm13, %ymm12
+
+	vpermilpd		$0x3, %xmm5, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 8(%r12)
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vmulpd			%ymm9, %ymm13, %ymm9
+	vblendpd		$0x3, %ymm12, %ymm5, %ymm5
+
+	// third column
+	vpermpd			$0x00, %ymm6, %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vfnmadd231pd	%ymm8, %ymm13, %ymm10
+	vblendpd		$0x2, %ymm6, %ymm13, %ymm12
+
+	vpermpd			$0x55, %ymm6, %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vfnmadd231pd	%ymm9, %ymm13, %ymm10
+	vblendpd		$0x4, %ymm6, %ymm12, %ymm12
+
+	vpermpd			$0xaa, %ymm6, %ymm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 16(%r12)
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vmulpd			%ymm10, %ymm13, %ymm10
+	vblendpd		$0x7, %ymm12, %ymm6, %ymm6
+
+	// fourth column
+	vpermpd			$0x00, %ymm7, %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vfnmadd231pd	%ymm8, %ymm13, %ymm11
+	vblendpd		$0x2, %ymm7, %ymm13, %ymm12
+
+	vpermpd			$0x55, %ymm7, %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vfnmadd231pd	%ymm9, %ymm13, %ymm11
+	vblendpd		$0x4, %ymm7, %ymm12, %ymm12
+
+	vpermpd			$0xaa, %ymm7, %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+	vfnmadd231pd	%ymm10, %ymm13, %ymm11
+	vblendpd		$0x8, %ymm7, %ymm12, %ymm12
+	
+	vpermpd			$0xff, %ymm7, %ymm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 24(%r12)
+//	vmulpd			%ymm7, %ymm13, %ymm7
+	vmulpd			%ymm11, %ymm13, %ymm11
+	vblendpd		$0x7, %ymm12, %ymm7, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgetrf_m_12x4_lib4, .-inner_edge_dgetrf_m_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// right kernel
+//
+// input arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
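+// Reference sketch (C-style, comments only): forward substitution with the
+// already factorized unit lower triangular block E[0:8][0:8] (and the matching
+// rows E[8:12][0:8] of L), followed by LU factorization without pivoting of the
+// bottom 4x4 block, rows 8..11; the pivot reciprocals go to inv_diag_D:
+//
+//     for(k=0; k<8; k++)
+//         for(j=0; j<4; j++)
+//             for(i=k+1; i<12; i++)
+//                 X[i][j] -= E[i][k] * X[k][j];
+//
+//     for(j=0; j<4; j++) {
+//         for(k=0; k<j; k++)
+//             for(i=k+1; i<4; i++)
+//                 X[8+i][j] -= X[8+i][k] * X[8+k][j];
+//         inv_diag_D[j] = 1.0 / X[8+j][j];
+//         for(i=j+1; i<4; i++)
+//             X[8+i][j] *= inv_diag_D[j];
+//     }
+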
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGETRF_R_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgetrf_r_12x4_lib4, @function
+inner_edge_dgetrf_r_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_r_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgetrf_r_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_r_12x4_lib4:
+#endif
+#endif
+	
+	movq	%r10, %r14 // E1 <- E0
+	addq	%r11, %r14 // E1 <- E0 + 4*sde*sizeof(double)
+	movq	%r14, %r13 // E2 <- E1
+	addq	%r11, %r13 // E2 <- E1 + 4*sde*sizeof(double)
+
+	// solve upper 8x4 & correct lower 4x4
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			0(%r10), %ymm12
+	vblendpd		$0x1, %ymm15, %ymm12, %ymm12
+	vmovapd			0(%r14), %ymm14
+	vmovapd			0(%r13), %ymm15
+	vpermpd			$0x00, %ymm0, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm0
+	vfnmadd231pd	%ymm14, %ymm13, %ymm4
+	vfnmadd231pd	%ymm15, %ymm13, %ymm8
+	vpermpd			$0x00, %ymm1, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm1
+	vfnmadd231pd	%ymm14, %ymm13, %ymm5
+	vfnmadd231pd	%ymm15, %ymm13, %ymm9
+	vpermpd			$0x00, %ymm2, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm2
+	vfnmadd231pd	%ymm14, %ymm13, %ymm6
+	vfnmadd231pd	%ymm15, %ymm13, %ymm10
+	vpermpd			$0x00, %ymm3, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm3
+	vfnmadd231pd	%ymm14, %ymm13, %ymm7
+	vfnmadd231pd	%ymm15, %ymm13, %ymm11
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			32(%r10), %ymm12
+	vblendpd		$0x3, %ymm15, %ymm12, %ymm12
+	vmovapd			32(%r14), %ymm14
+	vmovapd			32(%r13), %ymm15
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm0
+	vfnmadd231pd	%ymm14, %ymm13, %ymm4
+	vfnmadd231pd	%ymm15, %ymm13, %ymm8
+	vpermpd			$0x55, %ymm1, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm1
+	vfnmadd231pd	%ymm14, %ymm13, %ymm5
+	vfnmadd231pd	%ymm15, %ymm13, %ymm9
+	vpermpd			$0x55, %ymm2, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm2
+	vfnmadd231pd	%ymm14, %ymm13, %ymm6
+	vfnmadd231pd	%ymm15, %ymm13, %ymm10
+	vpermpd			$0x55, %ymm3, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm3
+	vfnmadd231pd	%ymm14, %ymm13, %ymm7
+	vfnmadd231pd	%ymm15, %ymm13, %ymm11
+
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vmovapd			64(%r10), %ymm12
+	vblendpd		$0x7, %ymm15, %ymm12, %ymm12
+	vmovapd			64(%r14), %ymm14
+	vmovapd			64(%r13), %ymm15
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm0
+	vfnmadd231pd	%ymm14, %ymm13, %ymm4
+	vfnmadd231pd	%ymm15, %ymm13, %ymm8
+	vpermpd			$0xaa, %ymm1, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm1
+	vfnmadd231pd	%ymm14, %ymm13, %ymm5
+	vfnmadd231pd	%ymm15, %ymm13, %ymm9
+	vpermpd			$0xaa, %ymm2, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm2
+	vfnmadd231pd	%ymm14, %ymm13, %ymm6
+	vfnmadd231pd	%ymm15, %ymm13, %ymm10
+	vpermpd			$0xaa, %ymm3, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm3
+	vfnmadd231pd	%ymm14, %ymm13, %ymm7
+	vfnmadd231pd	%ymm15, %ymm13, %ymm11
+
+	vmovapd			96(%r14), %ymm14
+	vmovapd			96(%r13), %ymm15
+	vpermpd			$0xff, %ymm0, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm4
+	vfnmadd231pd	%ymm15, %ymm13, %ymm8
+	vpermpd			$0xff, %ymm1, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm5
+	vfnmadd231pd	%ymm15, %ymm13, %ymm9
+	vpermpd			$0xff, %ymm2, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm6
+	vfnmadd231pd	%ymm15, %ymm13, %ymm10
+	vpermpd			$0xff, %ymm3, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm7
+	vfnmadd231pd	%ymm15, %ymm13, %ymm11
+
+	addq		$128, %r14
+	addq		$128, %r13
+
+
+	vxorpd			%ymm14, %ymm14, %ymm14
+	vmovapd			0(%r14), %ymm12
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd			0(%r13), %ymm14
+	vpermpd			$0x00, %ymm4, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm4
+	vfnmadd231pd	%ymm14, %ymm13, %ymm8
+	vpermpd			$0x00, %ymm5, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm5
+	vfnmadd231pd	%ymm14, %ymm13, %ymm9
+	vpermpd			$0x00, %ymm6, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm6
+	vfnmadd231pd	%ymm14, %ymm13, %ymm10
+	vpermpd			$0x00, %ymm7, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm7
+	vfnmadd231pd	%ymm14, %ymm13, %ymm11
+
+	vxorpd			%ymm14, %ymm14, %ymm14
+	vmovapd			32(%r14), %ymm12
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd			32(%r13), %ymm14
+	vpermpd			$0x55, %ymm4, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm4
+	vfnmadd231pd	%ymm14, %ymm13, %ymm8
+	vpermpd			$0x55, %ymm5, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm5
+	vfnmadd231pd	%ymm14, %ymm13, %ymm9
+	vpermpd			$0x55, %ymm6, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm6
+	vfnmadd231pd	%ymm14, %ymm13, %ymm10
+	vpermpd			$0x55, %ymm7, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm7
+	vfnmadd231pd	%ymm14, %ymm13, %ymm11
+
+	vxorpd			%ymm14, %ymm14, %ymm14
+	vmovapd			64(%r14), %ymm12
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd			64(%r13), %ymm14
+	vpermpd			$0xaa, %ymm4, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm4
+	vfnmadd231pd	%ymm14, %ymm13, %ymm8
+	vpermpd			$0xaa, %ymm5, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm5
+	vfnmadd231pd	%ymm14, %ymm13, %ymm9
+	vpermpd			$0xaa, %ymm6, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm6
+	vfnmadd231pd	%ymm14, %ymm13, %ymm10
+	vpermpd			$0xaa, %ymm7, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm7
+	vfnmadd231pd	%ymm14, %ymm13, %ymm11
+
+	vmovapd			96(%r13), %ymm14
+	vpermpd			$0xff, %ymm4, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm8
+	vpermpd			$0xff, %ymm5, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm9
+	vpermpd			$0xff, %ymm6, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm10
+	vpermpd			$0xff, %ymm7, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm11
+
+
+
+	// factorize lower 8x4
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC05(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd			LC05(%rip), %xmm14 // 1.0
+#endif
+//	vmovddup		%xmm14, %xmm14
+
+	// first column
+//	vblendpd		$0x1, %ymm8, %ymm12, %ymm12
+	vmovapd			%ymm8, %ymm12
+	vdivsd			%xmm8, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 0(%r12)
+	vmulpd			%ymm8, %ymm13, %ymm8
+	vblendpd		$0x1, %ymm12, %ymm8, %ymm8
+
+	// second column
+	vpermpd			$0x00, %ymm9, %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm9
+	vblendpd		$0x2, %ymm9, %ymm13, %ymm12
+
+	vpermilpd		$0x3, %xmm9, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 8(%r12)
+	vmulpd			%ymm9, %ymm13, %ymm9
+	vblendpd		$0x3, %ymm12, %ymm9, %ymm9
+
+	// third column
+	vpermpd			$0x00, %ymm10, %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm10
+	vblendpd		$0x2, %ymm10, %ymm13, %ymm12
+
+	vpermpd			$0x55, %ymm10, %ymm13
+	vfnmadd231pd	%ymm9, %ymm13, %ymm10
+	vblendpd		$0x4, %ymm10, %ymm12, %ymm12
+
+	vpermpd			$0xaa, %ymm10, %ymm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 16(%r12)
+	vmulpd			%ymm10, %ymm13, %ymm10
+	vblendpd		$0x7, %ymm12, %ymm10, %ymm10
+
+	// fourth column
+	vpermpd			$0x00, %ymm11, %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm11
+	vblendpd		$0x2, %ymm11, %ymm13, %ymm12
+
+	vpermpd			$0x55, %ymm11, %ymm13
+	vfnmadd231pd	%ymm9, %ymm13, %ymm11
+	vblendpd		$0x4, %ymm11, %ymm12, %ymm12
+
+	vpermpd			$0xaa, %ymm11, %ymm13
+	vfnmadd231pd	%ymm10, %ymm13, %ymm11
+	vblendpd		$0x8, %ymm11, %ymm12, %ymm12
+	
+	vpermpd			$0xff, %ymm11, %ymm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+//	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 24(%r12)
+//	vmulpd			%ymm11, %ymm13, %ymm11
+	vblendpd		$0x7, %ymm12, %ymm11, %ymm11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgetrf_r_12x4_lib4, .-inner_edge_dgetrf_r_12x4_lib4
+#endif
+#endif
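+
+// For reference, a minimal scalar C sketch of the column-by-column pattern used
+// above (my reading of the routine; the helper name, the column-major 4x4 tile
+// and the absence of pivoting are assumptions for illustration, not the
+// library's API): each column is updated with the previously factorized
+// columns, the reciprocal of its diagonal entry is stored in inv_diag_D, and
+// the entries below the diagonal are scaled by that reciprocal.
+//
+//	// A[j][i] = element in column j, row i of the 4x4 tile
+//	static void getrf_tile4_sketch(double A[4][4], double *inv_diag_D)
+//		{
+//		for(int j=0; j<4; j++)
+//			{
+//			for(int l=0; l<j; l++) // subtract already-factorized columns
+//				for(int i=l+1; i<4; i++)
+//					A[j][i] -= A[l][i] * A[j][l];
+//			double inv = 1.0 / A[j][j]; // diagonal assumed nonzero (no pivoting here)
+//			inv_diag_D[j] = inv;
+//			for(int i=j+1; i<4; i++) // scale the subdiagonal entries
+//				A[j][i] *= inv;
+//			}
+//		}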
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_12x4_lib4, @function
+inner_store_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_12x4_lib4; .scl 2; .type 32; .endef
+inner_store_12x4_lib4:
+#endif
+#endif
+	
+	vmovapd %ymm0,  0(%r10)
+	vmovapd %ymm1, 32(%r10)
+	vmovapd %ymm2, 64(%r10)
+	vmovapd %ymm3, 96(%r10)
+
+	vmovapd %ymm4,  0(%r10, %r11, 1)
+	vmovapd %ymm5, 32(%r10, %r11, 1)
+	vmovapd %ymm6, 64(%r10, %r11, 1)
+	vmovapd %ymm7, 96(%r10, %r11, 1)
+
+	vmovapd %ymm8,   0(%r10, %r11, 2)
+	vmovapd %ymm9,  32(%r10, %r11, 2)
+	vmovapd %ymm10, 64(%r10, %r11, 2)
+	vmovapd %ymm11, 96(%r10, %r11, 2)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_12x4_lib4, .-inner_store_12x4_lib4
+#endif
+#endif
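+
+// For reference, a hedged C sketch of the memory layout this store routine
+// assumes (lib4 panel-major storage; the helper below is illustrative only,
+// not part of the library): the 12x4 result sits in three 4-row panels, each
+// panel holds its 4 columns contiguously (4 doubles, i.e. 32 bytes, per
+// column), and consecutive panels are 4*sdd doubles apart, which is the byte
+// stride passed in r11.
+//
+//	static void store_12x4_sketch(const double acc[12][4], double *D, int sdd)
+//		{
+//		for(int p=0; p<3; p++) // row panel: rows 4*p .. 4*p+3
+//			for(int j=0; j<4; j++) // column inside the panel
+//				for(int i=0; i<4; i++) // row inside the panel
+//					D[p*4*sdd + 4*j + i] = acc[4*p+i][j];
+//		}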
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X12_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x12_lib4, @function
+inner_store_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x12_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x12_lib4; .scl 2; .type 32; .endef
+inner_store_4x12_lib4:
+#endif
+#endif
+	
+	vmovapd %ymm0,   0(%r10)
+	vmovapd %ymm1,  32(%r10)
+	vmovapd %ymm2,  64(%r10)
+	vmovapd %ymm3,  96(%r10)
+
+	vmovapd %ymm4, 128(%r10)
+	vmovapd %ymm5, 160(%r10)
+	vmovapd %ymm6, 192(%r10)
+	vmovapd %ymm7, 224(%r10)
+
+	vmovapd %ymm8, 256(%r10)
+	vmovapd %ymm9, 288(%r10)
+	vmovapd %ymm10, 320(%r10)
+	vmovapd %ymm11, 352(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x12_lib4, .-inner_store_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r14  <- dirty
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r14  <- dirty
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_12X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_12x4_vs_lib4, @function
+inner_store_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_12x4_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC04(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	vmovapd		%ymm0, 0(%r10)
+	vmovapd		%ymm4, 0(%r10, %r11, 1)
+	vmaskmovpd	%ymm8, %ymm15,  0(%r10, %r11, 2)
+	cmpl		$2, %r13d
+	jl			0f // end
+	vmovapd		%ymm1, 32(%r10)
+	vmovapd		%ymm5, 32(%r10, %r11, 1)
+	vmaskmovpd	%ymm9, %ymm15, 32(%r10, %r11, 2)
+	cmpl		$3, %r13d
+	jl			0f // end
+	vmovapd		%ymm2, 64(%r10)
+	vmovapd		%ymm6, 64(%r10, %r11, 1)
+	vmaskmovpd	%ymm10, %ymm15, 64(%r10, %r11, 2)
+	je			0f // end
+	vmovapd		%ymm3, 96(%r10)
+	vmovapd		%ymm7, 96(%r10, %r11, 1)
+	vmaskmovpd	%ymm11, %ymm15, 96(%r10, %r11, 2)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_12x4_vs_lib4, .-inner_store_12x4_vs_lib4
+#endif
+#endif
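+
+// For reference, the km value is turned into a lane mask for vmaskmovpd by
+// broadcasting (double)km and subtracting it from a constant vector of row
+// indices (LC04, defined elsewhere in this file): a lane's sign bit ends up
+// set exactly when its row is below km. A hedged C sketch of the equivalent
+// predicate (helper name is illustrative only):
+//
+//	static void row_mask_sketch(long long mask[4], int km, int first_row)
+//		{
+//		for(int i=0; i<4; i++)
+//			mask[i] = (first_row+i < km) ? -1LL : 0LL; // MSB set => lane is stored
+//		}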
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10   <- D
+// r11d  <- km
+// r12d  <- kn
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10   <- D
+// r11d  <- km
+// r12d  <- kn
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X12_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x12_vs_lib4, @function
+inner_store_4x12_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x12_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x12_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x12_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	vmaskmovpd	%ymm0, %ymm15,   0(%r10)
+	vmaskmovpd	%ymm1, %ymm15,  32(%r10)
+	vmaskmovpd	%ymm2, %ymm15,  64(%r10)
+	vmaskmovpd	%ymm3, %ymm15,  96(%r10)
+
+	vmaskmovpd	%ymm4, %ymm15, 128(%r10)
+	vmaskmovpd	%ymm5, %ymm15, 160(%r10)
+	vmaskmovpd	%ymm6, %ymm15, 192(%r10)
+	vmaskmovpd	%ymm7, %ymm15, 224(%r10)
+
+	vmaskmovpd	%ymm8, %ymm15, 256(%r10)
+	cmpl		$10, %r12d
+	jl			0f // end
+	vmaskmovpd	%ymm9, %ymm15, 288(%r10)
+	cmpl		$11, %r12d
+	jl			0f // end
+	vmaskmovpd	%ymm10, %ymm15, 320(%r10)
+	je			0f // end
+	vmaskmovpd	%ymm11, %ymm15, 352(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x12_vs_lib4, .-inner_store_4x12_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_12X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_12x4_lib4, @function
+inner_store_l_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_12x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_12x4_lib4:
+#endif
+#endif
+	
+	vmovapd		%ymm0, 0(%r10)
+	vmovapd		32(%r10), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm1, %ymm1	
+	vmovapd		%ymm1, 32(%r10)
+	vmovapd		64(%r10), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm2, %ymm2	
+	vmovapd		%ymm2, 64(%r10)
+	vmovapd		96(%r10), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm3, %ymm3	
+	vmovapd		%ymm3, 96(%r10)
+
+	vmovapd		%ymm4, 0(%r10, %r11, 1)
+	vmovapd		%ymm5, 32(%r10, %r11, 1)
+	vmovapd		%ymm6, 64(%r10, %r11, 1)
+	vmovapd		%ymm7, 96(%r10, %r11, 1)
+
+	vmovapd		%ymm8, 0(%r10, %r11, 2)
+	vmovapd		%ymm9, 32(%r10, %r11, 2)
+	vmovapd		%ymm10, 64(%r10, %r11, 2)
+	vmovapd		%ymm11, 96(%r10, %r11, 2)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_12x4_lib4, .-inner_store_l_12x4_lib4
+#endif
+#endif
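+
+// For reference, a hedged scalar sketch of the "store lower" behaviour above
+// (illustrative loop only; D points at the top 4x4 panel, acc is the
+// accumulator): in the top 4x4 block only the lower triangle is written back,
+// while the entries above the diagonal are re-stored from the values already
+// present in D (that is what the blends with the loads from 32/64/96(%r10)
+// implement); the two lower panels are stored in full.
+//
+//	for(int j=0; j<4; j++)
+//		for(int i=0; i<4; i++)
+//			if(i >= j)
+//				D[4*j+i] = acc[i][j];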
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_12X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_12x4_vs_lib4, @function
+inner_store_l_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_12x4_vs_lib4:
+#endif
+#endif
+	
+	movq	%r10, %r15 // D1 <- D0
+	addq	%r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+	movq	%r15, %r14 // D2 <- D1
+	addq	%r11, %r14 // D2 <- D1 + 4*sdd*sizeof(double)
+
+	vcvtsi2sd	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC04(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	cmpl		$2, %r13d
+	vmovapd		%ymm0, 0(%r10)
+	vmovapd		%ymm4, 0(%r15)
+	vmaskmovpd	%ymm8, %ymm15,  0(%r14)
+	jl			0f // end
+	cmpl		$3, %r13d
+	vmovapd		32(%r10), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm1, %ymm1	
+	vmovapd		%ymm1, 32(%r10)
+	vmovapd		%ymm5, 32(%r15)
+	vmaskmovpd	%ymm9, %ymm15, 32(%r14)
+	jl			0f // end
+	vmovapd		64(%r10), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm2, %ymm2	
+	vmovapd		%ymm2, 64(%r10)
+	vmovapd		%ymm6, 64(%r15)
+	vmaskmovpd	%ymm10, %ymm15, 64(%r14)
+	je			0f // end
+	vmovapd		96(%r10), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm3, %ymm3	
+	vmovapd		%ymm3, 96(%r10)
+	vmovapd		%ymm7, 96(%r15)
+	vmaskmovpd	%ymm11, %ymm15, 96(%r14)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_12x4_vs_lib4, .-inner_store_l_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+//                                1      2              3          4        5          6             7          8        9          10
+// void kernel_dgemm_nt_12x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_12x4_lib4
+	.type kernel_dgemm_nt_12x4_lib4, @function
+kernel_dgemm_nt_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_12x4_lib4
+_kernel_dgemm_nt_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_12x4_lib4
+	.def kernel_dgemm_nt_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_12x4_lib4, .-kernel_dgemm_nt_12x4_lib4
+#endif
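+
+// For reference, a hedged usage sketch of the exported kernel above: it computes
+// the 12x4 panel-major block D = alpha*A*B^T + beta*C, where A is 12 x k stored
+// as three 4-row panels with panel stride sda, B is a single 4 x k panel, and C
+// and D are 12x4 blocks with panel strides sdc and sdd. The buffer sizes and
+// stride values below are illustrative assumptions, not requirements taken from
+// this file.
+//
+//	// k = 8; A needs 12*sda doubles, B needs 4*k, C and D need 12*sdc / 12*sdd
+//	// double alpha = 1.0, beta = 0.0;
+//	// double A[12*8], B[4*8], C[12*4], D[12*4]; // sda = 8, sdc = sdd = 4
+//	// ... fill A, B, C in panel-major order ...
+//	// kernel_dgemm_nt_12x4_lib4(8, &alpha, A, 8, B, &beta, C, 4, D, 4);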
+
+
+
+
+
+//                                1      2              3          4          5        6             7          8
+// void kernel_dgemm_nt_4x12_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x12_lib4
+	.type kernel_dgemm_nt_4x12_lib4, @function
+kernel_dgemm_nt_4x12_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x12_lib4
+_kernel_dgemm_nt_4x12_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x12_lib4
+	.def kernel_dgemm_nt_4x12_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x12_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // B
+	movq	ARG5, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x12_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x12_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x12_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x12_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x12_lib4, .-kernel_dgemm_nt_4x12_lib4
+#endif
+
+
+
+
+
+//                                   rdi     rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32   rsp+40  rsp+48
+// void kernel_dgemm_nt_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_12x4_vs_lib4
+	.type kernel_dgemm_nt_12x4_vs_lib4, @function
+kernel_dgemm_nt_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_12x4_vs_lib4
+_kernel_dgemm_nt_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_12x4_vs_lib4
+	.def kernel_dgemm_nt_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_12x4_vs_lib4, .-kernel_dgemm_nt_12x4_vs_lib4
+#endif
+
+
+
+
+
+//                                   1      2              3          4          5        6             7          8          9       10
+// void kernel_dgemm_nt_4x12_vs_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x12_vs_lib4
+	.type kernel_dgemm_nt_4x12_vs_lib4, @function
+kernel_dgemm_nt_4x12_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x12_vs_lib4
+_kernel_dgemm_nt_4x12_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x12_vs_lib4
+	.def kernel_dgemm_nt_4x12_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x12_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // B
+	movq	ARG5, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x12_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x12_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // km
+	movq	ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X12_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x12_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x12_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x12_vs_lib4, .-kernel_dgemm_nt_4x12_vs_lib4
+#endif
+
+
+
+
+
+//                                rdi    rsi            rdx        rcx      r8           r9         rsp+8    rsp+16        rsp+24     rsp+32   rsp+40     rsp+48
+// void kernel_dgemm_nn_12x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_12x4_lib4
+	.type kernel_dgemm_nn_12x4_lib4, @function
+kernel_dgemm_nn_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_12x4_lib4
+_kernel_dgemm_nn_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_12x4_lib4
+	.def kernel_dgemm_nn_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12 // C
+	movq	ARG10, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_12x4_lib4, .-kernel_dgemm_nn_12x4_lib4
+#endif
+
+
+
+
+
+//                                rdi    rsi            rdx        rcx          r8         r9       rsp+8         rsp+16     rsp+24
+// void kernel_dgemm_nn_4x12_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_4x12_lib4
+	.type kernel_dgemm_nn_4x12_lib4, @function
+kernel_dgemm_nn_4x12_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_4x12_lib4
+_kernel_dgemm_nn_4x12_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_4x12_lib4
+	.def kernel_dgemm_nn_4x12_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x12_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_4x12_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_4x12_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x12_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x12_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x12_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x12_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x12_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x12_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_4x12_lib4, .-kernel_dgemm_nn_4x12_lib4
+#endif
+
+
+
+
+
+//                                   rdi     rsi            rdx        rcx      r8         r9       rsp+8         rsp+16     rsp+24   rsp+32     rsp+40   rsp+48  rsp+56
+// void kernel_dgemm_nn_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_12x4_vs_lib4
+	.type kernel_dgemm_nn_12x4_vs_lib4, @function
+kernel_dgemm_nn_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_12x4_vs_lib4
+_kernel_dgemm_nn_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_12x4_vs_lib4
+	.def kernel_dgemm_nn_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+	movq	ARG6, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // C
+	movq	ARG9, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG10, %r10 // store address D
+	movq	ARG11, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG12, %r12 // km 
+	movq	ARG13, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_12x4_vs_lib4, .-kernel_dgemm_nn_12x4_vs_lib4
+#endif
+
+
+
+
+
+//                                  rdi     rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32
+// void kernel_dsyrk_nt_l_12x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_12x4_lib4
+	.type kernel_dsyrk_nt_l_12x4_lib4, @function
+kernel_dsyrk_nt_l_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_12x4_lib4
+_kernel_dsyrk_nt_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_12x4_lib4
+	.def kernel_dsyrk_nt_l_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_lib4
+#endif
+#endif
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_12x4_lib4, .-kernel_dsyrk_nt_l_12x4_lib4
+#endif
+
+
+
+
+
+//                                     rdi     rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32   rsp+40  rsp+48
+// void kernel_dsyrk_nt_l_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_12x4_vs_lib4
+	.type kernel_dsyrk_nt_l_12x4_vs_lib4, @function
+kernel_dsyrk_nt_l_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_12x4_vs_lib4
+_kernel_dsyrk_nt_l_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_12x4_vs_lib4
+	.def kernel_dsyrk_nt_l_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_vs_lib4
+#endif
+#endif
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_12x4_vs_lib4, .-kernel_dsyrk_nt_l_12x4_vs_lib4
+#endif
+
+
+
+
+
+//                                   rdi    rsi            rdx        rcx      r8           r9         rsp+8    rsp+16     rsp+24
+// void kernel_dtrmm_nn_rl_12x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nn_rl_12x4_lib4
+	.type kernel_dtrmm_nn_rl_12x4_lib4, @function
+kernel_dtrmm_nn_rl_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nn_rl_12x4_lib4
+_kernel_dtrmm_nn_rl_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nn_rl_12x4_lib4
+	.def kernel_dtrmm_nn_rl_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NN_RL_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nn_rl_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nn_rl_12x4_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_12x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nn_rl_12x4_lib4, .-kernel_dtrmm_nn_rl_12x4_lib4
+#endif
+
+
+
+
+
+//                                      1      2              3          4        5            6          7        8          9        10      11
+// void kernel_dtrmm_nn_rl_12x4_vs_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nn_rl_12x4_vs_lib4
+	.type kernel_dtrmm_nn_rl_12x4_vs_lib4, @function
+kernel_dtrmm_nn_rl_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nn_rl_12x4_vs_lib4
+_kernel_dtrmm_nn_rl_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nn_rl_12x4_vs_lib4
+	.def kernel_dtrmm_nn_rl_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NN_RL_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nn_rl_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nn_rl_12x4_vs_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_12x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG10, %r12 // km
+	movq	ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nn_rl_12x4_vs_lib4, .-kernel_dtrmm_nn_rl_12x4_vs_lib4
+#endif
+
+
+
+
+
+//                                   rdi    rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32
+// void kernel_dtrmm_nt_ru_12x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_12x4_lib4
+	.type kernel_dtrmm_nt_ru_12x4_lib4, @function
+kernel_dtrmm_nt_ru_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_12x4_lib4
+_kernel_dtrmm_nt_ru_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_12x4_lib4
+	.def kernel_dtrmm_nt_ru_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d //k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+	addq	$128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+// call inner blend
+
+#if MACRO_LEVEL>=1
+//	INNER_BLEND_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+//	call inner_blend_12x4_lib4
+#elif defined(OS_MAC)
+//	callq _inner_blend_12x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG3, %r10 // A
+	movq	ARG4, %r11 // sda
+	sall	$5, %r11d // 4*sda*sizeof(double)
+	movq	ARG5, %r12 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_12x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_12x4_lib4, .-kernel_dtrmm_nt_ru_12x4_lib4
+#endif
+
+
+
+
+
+//                                      rdi    rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32   rsp+40  rsp+48
+// void kernel_dtrmm_nt_ru_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_12x4_vs_lib4
+	.type kernel_dtrmm_nt_ru_12x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_12x4_vs_lib4
+_kernel_dtrmm_nt_ru_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_12x4_vs_lib4
+	.def kernel_dtrmm_nt_ru_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d //k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+	addq	$128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+// call inner blend
+
+#if MACRO_LEVEL>=1
+//	INNER_BLEND_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+//	call inner_blend_12x4_lib4
+#elif defined(OS_MAC)
+//	callq _inner_blend_12x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG3, %r10 // A
+	movq	ARG4, %r11 // sda
+	sall	$5, %r11d // 4*sda*sizeof(double)
+	movq	ARG5, %r12 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_12x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_12x4_vs_lib4, .-kernel_dtrmm_nt_ru_12x4_vs_lib4
+#endif
+
+
+
+
+
+//                                   rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24
+// void kernel_dpotrf_nt_l_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_12x4_lib4
+	.type kernel_dpotrf_nt_l_12x4_lib4, @function
+kernel_dpotrf_nt_l_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_12x4_lib4
+_kernel_dpotrf_nt_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_12x4_lib4
+	.def kernel_dpotrf_nt_l_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$4, %r11d
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_12x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_12x4_lib4, .-kernel_dpotrf_nt_l_12x4_lib4
+#endif
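+
+// For reference, a hedged scalar sketch of what the kernel above computes, as I
+// read it (the helper below uses plain column-major 12x4 storage and assumes
+// positive pivots; it is illustrative only, not the library's reference code):
+// D = C - A*B^T is formed, the top 4x4 block of D is Cholesky-factorized in its
+// lower triangle, the reciprocals of the factor's diagonal go to inv_diag_D,
+// and the 8 rows below are solved against that triangular factor.
+//
+//	// #include <math.h> for sqrt
+//	static void dpotrf_nt_l_12x4_sketch(int k, const double *A, const double *B,
+//		const double *C, double *D, double *inv_diag_D)
+//		{
+//		for(int j=0; j<4; j++) // D = C - A*B^T
+//			for(int i=0; i<12; i++)
+//				{
+//				double d = C[i+12*j];
+//				for(int l=0; l<k; l++)
+//					d -= A[i+12*l] * B[j+4*l];
+//				D[i+12*j] = d;
+//				}
+//		for(int j=0; j<4; j++) // factorize the top 4x4, solve rows 4..11
+//			{
+//			for(int l=0; l<j; l++)
+//				for(int i=j; i<12; i++)
+//					D[i+12*j] -= D[i+12*l] * D[j+12*l];
+//			double inv = 1.0 / sqrt(D[j+12*j]);
+//			inv_diag_D[j] = inv;
+//			for(int i=j; i<12; i++)
+//				D[i+12*j] *= inv;
+//			}
+//		}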
+
+
+
+
+
+//                                      rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24              rsp+32  rsp+40 
+// void kernel_dpotrf_nt_l_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_12x4_vs_lib4
+	.type kernel_dpotrf_nt_l_12x4_vs_lib4, @function
+kernel_dpotrf_nt_l_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_12x4_vs_lib4
+_kernel_dpotrf_nt_l_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_12x4_vs_lib4
+	.def kernel_dpotrf_nt_l_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_12x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG10, %r12 // km 
+	movq	ARG11, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_12x4_vs_lib4, .-kernel_dpotrf_nt_l_12x4_vs_lib4
+#endif
+
+
+
+
+
+//                                         rdi     rsi         rdx       rcx         r8      r9          rsp+8     rsp+16      rsp+24     rsp+32   rsp+40     rsp+48   rsp+56
+// void kernel_dsyrk_dpotrf_nt_l_12x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_12x4_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_12x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_12x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_12x4_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10 // km
+	movq	ARG6, %r11 // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+	movq	ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movl	$4, %r11d
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_12x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_12x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_12x4_lib4
+#endif
+
+
+
+
+
+//                                            rdi     rsi         rdx       rcx         r8      r9          rsp+8     rsp+16      rsp+24     rsp+32   rsp+40     rsp+48   rsp+56              rsp+64  rsp+72
+// void kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10 // km
+	movq	ARG6, %r11 // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+	movq	ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movq	ARG15, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_12x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG14, %r12 // km 
+	movq	ARG15, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4
+#endif
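+
+// Reading of the _vs suffix (illustrative, not from this library): the extra km/kn
+// arguments are forwarded to the store routine so that only the top-left km x kn
+// corner of the 12x4 tile is written, while the fixed-size variant earlier in the
+// file passes a hard-coded kn=4 to the factorization edge.  A minimal C sketch of
+// such a clamped store, with illustrative names:
+//
+//	static void ref_store_12x4_vs(const double *acc, double *D, int ldd, int km, int kn)
+//		{
+//		int i, j;
+//		for(j=0; j<kn && j<4; j++)
+//			for(i=0; i<km && i<12; i++)
+//				D[i+ldd*j] = acc[i+12*j];
+//		}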
+
+
+
+
+
+//                                          rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32              rsp+40  rsp+48 
+// void kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_12x4_vs_lib4
+	.type kernel_dtrsm_nt_rl_inv_12x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_12x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_12x4_vs_lib4
+	.def kernel_dtrsm_nt_rl_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_12x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_12x4_vs_lib4
+#endif
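+
+// Illustrative reading of the inner-routine sequence above (gemm sub nt, scale_11
+// with C, dtrsm_rlt_inv edge): the kernel appears to solve D * E^T = C - A*B^T for
+// a 12x4 D, with E lower triangular and inv_diag_E holding the reciprocals of its
+// diagonal.  A plain column-major C reference, ignoring the panel layout and the
+// km/kn clamping of the _vs store:
+//
+//	static void ref_dtrsm_nt_rl_inv_12x4(int k, const double *A, const double *B,
+//			const double *C, double *D, const double *E, const double *inv_diag_E)
+//		{
+//		int i, j, l;
+//		for(j=0; j<4; j++)
+//			for(i=0; i<12; i++)
+//				{
+//				double acc = C[i+12*j];
+//				for(l=0; l<k; l++) acc -= A[i+12*l] * B[j+4*l];
+//				D[i+12*j] = acc;
+//				}
+//		for(j=0; j<4; j++) // forward substitution against E^T, one column at a time
+//			{
+//			for(l=0; l<j; l++)
+//				for(i=0; i<12; i++)
+//					D[i+12*j] -= D[i+12*l] * E[j+4*l];
+//			for(i=0; i<12; i++)
+//				D[i+12*j] *= inv_diag_E[j];
+//			}
+//		}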
+
+
+
+
+
+//                                          1      2          3          4        5          6          7          8        9                   10      11
+// void kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_4x12_vs_lib4
+	.type kernel_dtrsm_nt_rl_inv_4x12_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_4x12_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_4x12_vs_lib4
+	.def kernel_dtrsm_nt_rl_inv_4x12_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // B
+	movq	ARG4, %r12 // sdb
+	sall	$5, %r12d // 32*sdb
+	movq	ARG2, %r13 // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_11_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_11_4x12_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_11_4x12_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+	movq	ARG8, %r11  // sde 
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG9, %r12  // inv_diag_E 
+	movq	ARG11, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X12_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x12_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x12_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG6, %r10 // store address D
+	movq	ARG10, %r11 // km 
+	movq	ARG11, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X12_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x12_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x12_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_4x12_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x12_vs_lib4
+#endif
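+
+// Note on the 4x12 variant just above (illustrative): it reuses the same 12x4
+// subtract kernel with the operands swapped (B and sdb go into the slots the 12x4
+// kernel uses for A and sda), so the accumulator holds B*A^T; the result is then
+// transposed while being blended with C (inner_tran_scale_11_4x12_lib4) and stored
+// as 4x12, since (B*A^T)^T = A*B^T.  A minimal C sketch of that transposing blend,
+// with the _11 suffix read as "1.0*acc + 1.0*C":
+//
+//	static void ref_tran_scale_11_4x12(const double *acc /* 12x4 */,
+//			const double *C /* 4x12 */, double *D /* 4x12 */)
+//		{
+//		int i, j;
+//		for(j=0; j<12; j++)
+//			for(i=0; i<4; i++)
+//				D[i+4*j] = C[i+4*j] + acc[j+12*i];
+//		}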
+
+
+
+
+
+//                                                1       2           3         4           5       6           7         8           9          10       11         12       13         14                  15      16
+// void kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10 // km
+	movq	ARG6, %r11 // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+	movq	ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10  // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG13, %r10  // E 
+	movq	ARG14, %r11  // inv_diag_E 
+	movq	ARG16, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG15, %r12 // km 
+	movq	ARG16, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4
+#endif
+
+
+
+
+
+//                                                1       2           3           4         5       6           7           8         9          10         11         12       13                 14       15
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4(int kp, double *Ap, double *Bp, int sdbp, int km, double *Am, double *Bm, int sdbm, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG3, %r11  // Bp
+	movq	ARG4, %r12 // sdbp
+	sall	$5, %r12d   // 32*sdbp
+	movq	ARG2, %r13  // Ap
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10 // km
+	movq	ARG7, %r11 // Bm
+	movq	ARG8, %r12 // sdbm
+	sall	$5, %r12d // 32*sdbm
+	movq	ARG6, %r13 // Am
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_11_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_11_4x12_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_11_4x12_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG11, %r10  // E 
+	movq	ARG12, %r11  // sde 
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG13, %r12  // inv_diag_E 
+	movq	ARG15, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X12_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x12_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x12_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG10, %r10 // store address D
+	movq	ARG14, %r11 // km 
+	movq	ARG15, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X12_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x12_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x12_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4
+#endif
+
+
+
+
+
+//                                       rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32  
+// void kernel_dtrsm_nt_rl_inv_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_12x4_lib4
+	.type kernel_dtrsm_nt_rl_inv_12x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_12x4_lib4
+_kernel_dtrsm_nt_rl_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_12x4_lib4
+	.def kernel_dtrsm_nt_rl_inv_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_12x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_12x4_lib4, .-kernel_dtrsm_nt_rl_inv_12x4_lib4
+#endif
+
+
+
+
+
+//                                       1      2          3          4        5          6          7          8        9
+// void kernel_dtrsm_nt_rl_inv_4x12_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sde, double *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_4x12_lib4
+	.type kernel_dtrsm_nt_rl_inv_4x12_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x12_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_4x12_lib4
+_kernel_dtrsm_nt_rl_inv_4x12_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_4x12_lib4
+	.def kernel_dtrsm_nt_rl_inv_4x12_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x12_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // B
+	movq	ARG4, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG2, %r13 // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_11_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_11_4x12_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_11_4x12_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+	movq	ARG8, %r11  // sde 
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG9, %r12  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x12_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x12_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG6, %r10 // store address D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x12_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x12_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_4x12_lib4, .-kernel_dtrsm_nt_rl_inv_4x12_lib4
+#endif
+
+
+
+
+
+//                                             rdi     rsi         rdx       rcx         r8      r9          rsp+8     rsp+16      rsp+24     rsp+32   rsp+40     rsp+48   rsp+56     rsp+64
+// void kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10 // km
+	movq	ARG6, %r11 // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+	movq	ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10  // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG13, %r10  // E 
+	movq	ARG14, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_12x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4
+#endif
+
+
+
+
+
+//                                       rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24
+// void kernel_dtrsm_nt_rl_one_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_one_12x4_lib4
+	.type kernel_dtrsm_nt_rl_one_12x4_lib4, @function
+kernel_dtrsm_nt_rl_one_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_one_12x4_lib4
+_kernel_dtrsm_nt_rl_one_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_one_12x4_lib4
+	.def kernel_dtrsm_nt_rl_one_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_ONE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_one_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_one_12x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_one_12x4_lib4, .-kernel_dtrsm_nt_rl_one_12x4_lib4
+#endif
+
+
+
+
+
+//                                          rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32  rsp+40
+// void kernel_dtrsm_nt_rl_one_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_one_12x4_vs_lib4
+	.type kernel_dtrsm_nt_rl_one_12x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_one_12x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_one_12x4_vs_lib4
+	.def kernel_dtrsm_nt_rl_one_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_ONE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_one_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_one_12x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG10, %r12 // km 
+	movq	ARG11, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_one_12x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_12x4_vs_lib4
+#endif
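+
+// Reading of the _one suffix (illustrative): these kernels take no inv_diag_E
+// argument and call the dtrsm_rlt_one edge, i.e. the diagonal of E is treated as
+// unit, so the back-substitution skips the per-column scaling of the _inv variant.
+// Sketch for one column j of the 12x4 result, assuming the same layout as the
+// reference above:
+//
+//	static void ref_dtrsm_rlt_one_col(double *D /* 12x4 */, const double *E /* 4x4 */, int j)
+//		{
+//		int i, l;
+//		for(l=0; l<j; l++)
+//			for(i=0; i<12; i++)
+//				D[i+12*j] -= D[i+12*l] * E[j+4*l];
+//		// unit diagonal: column j needs no final scaling
+//		}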
+
+
+
+
+
+//                                       rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32 
+// void kernel_dtrsm_nt_ru_inv_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_ru_inv_12x4_lib4
+	.type kernel_dtrsm_nt_ru_inv_12x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_ru_inv_12x4_lib4
+_kernel_dtrsm_nt_ru_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_ru_inv_12x4_lib4
+	.def kernel_dtrsm_nt_ru_inv_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUT_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rut_inv_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rut_inv_12x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_ru_inv_12x4_lib4, .-kernel_dtrsm_nt_ru_inv_12x4_lib4
+#endif
+
+
+
+
+
+//                                          rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32              rsp+40  rsp+48
+// void kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_ru_inv_12x4_vs_lib4
+	.type kernel_dtrsm_nt_ru_inv_12x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_ru_inv_12x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_ru_inv_12x4_vs_lib4
+	.def kernel_dtrsm_nt_ru_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUT_INV_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rut_inv_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rut_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_ru_inv_12x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_12x4_vs_lib4
+#endif
+
+
+
+
+
+//                                       edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40
+// void kernel_dtrsm_nn_ru_inv_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ru_inv_12x4_lib4
+	.type kernel_dtrsm_nn_ru_inv_12x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ru_inv_12x4_lib4
+_kernel_dtrsm_nn_ru_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ru_inv_12x4_lib4
+	.def kernel_dtrsm_nn_ru_inv_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUN_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_run_inv_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_run_inv_12x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ru_inv_12x4_lib4, .-kernel_dtrsm_nn_ru_inv_12x4_lib4
+#endif
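+
+// Illustrative reading of the nn kernels (gemm sub nn, then a dtrsm_run_inv edge):
+// B is now traversed by 4-row panels as well, which is why these kernels take an
+// extra sdb that is also shifted to 4*sdb*sizeof(double); the solve appears to be
+// D * E = C - A*B with E upper triangular and inv_diag_E its reciprocal diagonal.
+// Plain column-major C reference (B stored as k x 4 with leading dimension k):
+//
+//	static void ref_dtrsm_nn_ru_inv_12x4(int k, const double *A, const double *B,
+//			const double *C, double *D, const double *E, const double *inv_diag_E)
+//		{
+//		int i, j, l;
+//		for(j=0; j<4; j++)
+//			for(i=0; i<12; i++)
+//				{
+//				double acc = C[i+12*j];
+//				for(l=0; l<k; l++) acc -= A[i+12*l] * B[l+k*j];
+//				D[i+12*j] = acc;
+//				}
+//		for(j=0; j<4; j++) // solve against upper-triangular E, left to right
+//			{
+//			for(l=0; l<j; l++)
+//				for(i=0; i<12; i++)
+//					D[i+12*j] -= D[i+12*l] * E[l+4*j];
+//			for(i=0; i<12; i++)
+//				D[i+12*j] *= inv_diag_E[j];
+//			}
+//		}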
+
+
+
+
+
+//                                          edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40              rsp+48  rsp+56
+// void kernel_dtrsm_nn_ru_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ru_inv_12x4_vs_lib4
+	.type kernel_dtrsm_nn_ru_inv_12x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ru_inv_12x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ru_inv_12x4_vs_lib4
+	.def kernel_dtrsm_nn_ru_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUN_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_run_inv_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_run_inv_12x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG12, %r12 // km
+	movq	ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ru_inv_12x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_12x4_vs_lib4
+#endif
+
+
+
+
+
+//                                       edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40
+// void kernel_dtrsm_nn_ll_one_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ll_one_12x4_lib4
+	.type kernel_dtrsm_nn_ll_one_12x4_lib4, @function
+kernel_dtrsm_nn_ll_one_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ll_one_12x4_lib4
+_kernel_dtrsm_nn_ll_one_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ll_one_12x4_lib4
+	.def kernel_dtrsm_nn_ll_one_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // sde
+	sall	$5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LLN_ONE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lln_one_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lln_one_12x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ll_one_12x4_lib4, .-kernel_dtrsm_nn_ll_one_12x4_lib4
+#endif
+
+
+
+
+
+//                                          edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40   rsp+48  rsp+56
+// void kernel_dtrsm_nn_ll_one_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ll_one_12x4_vs_lib4
+	.type kernel_dtrsm_nn_ll_one_12x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ll_one_12x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ll_one_12x4_vs_lib4
+	.def kernel_dtrsm_nn_ll_one_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // sde
+	sall	$5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LLN_ONE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lln_one_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lln_one_12x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG12, %r12 // km
+	movq	ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ll_one_12x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_12x4_vs_lib4
+#endif
+
+
+
+
+
+//                                       edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40   rsp+48
+// void kernel_dtrsm_nn_lu_inv_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_lu_inv_12x4_lib4
+	.type kernel_dtrsm_nn_lu_inv_12x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_lu_inv_12x4_lib4
+_kernel_dtrsm_nn_lu_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_lu_inv_12x4_lib4
+	.def kernel_dtrsm_nn_lu_inv_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // sde
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG12, %r12  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LUN_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lun_inv_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lun_inv_12x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_lu_inv_12x4_lib4, .-kernel_dtrsm_nn_lu_inv_12x4_lib4
+#endif
+
+
+
+
+
+//                                          edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40   rsp+48               rsp+56  rsp+64
+// void kernel_dtrsm_nn_lu_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_lu_inv_12x4_vs_lib4
+	.type kernel_dtrsm_nn_lu_inv_12x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_lu_inv_12x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_lu_inv_12x4_vs_lib4
+	.def kernel_dtrsm_nn_lu_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // sde
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG12, %r12  // inv_diag_E 
+	movq	ARG13, %r13  // km 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LUN_INV_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lun_inv_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lun_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG13, %r12  // km 
+	movq	ARG14, %r13  // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_lu_inv_12x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_12x4_vs_lib4
+#endif
+
+
+
+
+
+//                                   edi    rsi        rdx      rcx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32
+// void kernel_dgetrf_nn_l_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_l_12x4_lib4
+	.type kernel_dgetrf_nn_l_12x4_lib4, @function
+kernel_dgetrf_nn_l_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_l_12x4_lib4
+_kernel_dgetrf_nn_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_l_12x4_lib4
+	.def kernel_dgetrf_nn_l_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG10, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_L_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_l_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_l_12x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_l_12x4_lib4, .-kernel_dgetrf_nn_l_12x4_lib4
+#endif
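+
+// One plausible reading of the dgetrf_nn_l kernels (illustrative, not a spec): an
+// unpivoted, right-looking LU step on the 12x4 block C - A*B, leaving U on and
+// above the diagonal of the leading 4x4 block, the multipliers of L (unit diagonal
+// implicit) below it, and the reciprocals of U's diagonal in inv_diag_D.  Plain
+// column-major sketch on a block X that already holds C - A*B:
+//
+//	static void ref_dgetrf_l_12x4(double *X /* 12x4 */, double *inv_diag_D)
+//		{
+//		int i, j, l;
+//		for(j=0; j<4; j++)
+//			{
+//			inv_diag_D[j] = 1.0/X[j+12*j];
+//			for(i=j+1; i<12; i++)
+//				X[i+12*j] *= inv_diag_D[j]; // column j of L
+//			for(l=j+1; l<4; l++) // trailing update within the 4 columns
+//				for(i=j+1; i<12; i++)
+//					X[i+12*l] -= X[i+12*j] * X[j+12*l];
+//			}
+//		}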
+
+
+
+
+
+//                                      edi    rsi        rdx      rcx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32              rsp+40  rsp+48
+// void kernel_dgetrf_nn_l_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_l_12x4_vs_lib4
+	.type kernel_dgetrf_nn_l_12x4_vs_lib4, @function
+kernel_dgetrf_nn_l_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_l_12x4_vs_lib4
+_kernel_dgetrf_nn_l_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_l_12x4_vs_lib4
+	.def kernel_dgetrf_nn_l_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG10, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_L_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_l_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_l_12x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG11, %r12  // km
+	movq	ARG12, %r13  // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_l_12x4_vs_lib4, .-kernel_dgetrf_nn_l_12x4_vs_lib4
+#endif
+
+
+
+
+
+//                                   edi    rsi        rdx      rcx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32
+// void kernel_dgetrf_nn_m_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_m_12x4_lib4
+	.type kernel_dgetrf_nn_m_12x4_lib4, @function
+kernel_dgetrf_nn_m_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_m_12x4_lib4
+_kernel_dgetrf_nn_m_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_m_12x4_lib4
+	.def kernel_dgetrf_nn_m_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_m_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG8, %r10 // D
+	subq	$128, %r10 // E
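+	// (E = D - 4*4*sizeof(double): one 4x4 panel block before D)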
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG10, %r12  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_M_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_m_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_m_12x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_m_12x4_lib4, .-kernel_dgetrf_nn_m_12x4_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx      rcx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32              rsp+40  rsp+48
+// void kernel_dgetrf_nn_m_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_m_12x4_vs_lib4
+	.type kernel_dgetrf_nn_m_12x4_vs_lib4, @function
+kernel_dgetrf_nn_m_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_m_12x4_vs_lib4
+_kernel_dgetrf_nn_m_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_m_12x4_vs_lib4
+	.def kernel_dgetrf_nn_m_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_m_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG8, %r10 // D
+	subq	$128, %r10 // E
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG10, %r12  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_M_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_m_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_m_12x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG11, %r12  // km
+	movq	ARG12, %r13  // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_m_12x4_vs_lib4, .-kernel_dgetrf_nn_m_12x4_vs_lib4
+#endif
+
+
+
+
+
+//                                   edi    rsi        rdx      rcx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32
+// void kernel_dgetrf_nn_r_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_r_12x4_lib4
+	.type kernel_dgetrf_nn_r_12x4_lib4, @function
+kernel_dgetrf_nn_r_12x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_r_12x4_lib4
+_kernel_dgetrf_nn_r_12x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_r_12x4_lib4
+	.def kernel_dgetrf_nn_r_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_r_12x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG8, %r10 // D
+	subq	$256, %r10 // E
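+	// (E = D - 2*4*4*sizeof(double): two 4x4 panel blocks before D)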
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG10, %r12  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_R_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_r_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_r_12x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_r_12x4_lib4, .-kernel_dgetrf_nn_r_12x4_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx      rcx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32              rsp+40  rsp+48
+// void kernel_dgetrf_nn_r_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_r_12x4_vs_lib4
+	.type kernel_dgetrf_nn_r_12x4_vs_lib4, @function
+kernel_dgetrf_nn_r_12x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_r_12x4_vs_lib4
+_kernel_dgetrf_nn_r_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_r_12x4_vs_lib4
+	.def kernel_dgetrf_nn_r_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_r_12x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG8, %r10 // D
+	subq	$256, %r10 // E
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG10, %r12  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_R_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_r_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_r_12x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG11, %r12  // km
+	movq	ARG12, %r13  // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_r_12x4_vs_lib4, .-kernel_dgetrf_nn_r_12x4_vs_lib4
+#endif
+
+
+
+
+
+//                               1         2           3         4           5          6           7
+// void kernel_dlarfb12_r_4_lib4(int kmax, double *pV, int sdd, double *pT, double *pD, double *pK, int km);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dlarfb12_r_4_lib4
+	.type kernel_dlarfb12_r_4_lib4, @function
+kernel_dlarfb12_r_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dlarfb12_r_4_lib4
+_kernel_dlarfb12_r_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dlarfb12_r_4_lib4
+	.def kernel_dlarfb12_r_4_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb12_r_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+//	vxorpd	%ymm0, %ymm0, %ymm0
+//	vmovapd	%ymm0, %ymm1
+//	vmovapd	%ymm0, %ymm2
+//	vmovapd	%ymm0, %ymm3
+//	vmovapd	%ymm0, %ymm4
+//	vmovapd	%ymm0, %ymm5
+//	vmovapd	%ymm0, %ymm6
+//	vmovapd	%ymm0, %ymm7
+//	vmovapd	%ymm0, %ymm8
+//	vmovapd	%ymm0, %ymm9
+//	vmovapd	%ymm0, %ymm10
+//	vmovapd	%ymm0, %ymm11
+	
+	movq	ARG1, %r10 // k
+	movq	ARG5, %r11 // D
+	movq	ARG2, %r12 // V
+	movq	ARG3, %r13 // sdd
+	sall	$5, %r13d
+
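+	// The unrolled block below accumulates one result column per register:
+	// ymmJ starts as a copy of D column J and then gathers D(:,c)*V[J][c] for
+	// the following columns c, i.e. a product with the unit-diagonal triangle
+	// of the 12-column V block.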
+	//
+	vmovapd			0(%r11), %ymm12
+	vmovapd			%ymm12, %ymm0
+	//
+	vmovapd			32(%r11), %ymm12
+	vbroadcastsd	32(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vmovapd			%ymm12, %ymm1
+	//
+	vmovapd			64(%r11), %ymm12
+	vbroadcastsd	64(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	72(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vmovapd			%ymm12, %ymm2
+	//
+	vmovapd			96(%r11), %ymm12
+	vbroadcastsd	96(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	104(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	112(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vmovapd			%ymm12, %ymm3
+	//
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+	//
+	vmovapd			0(%r11), %ymm12
+	vbroadcastsd	0(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	8(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	16(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	24(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vmovapd			%ymm12, %ymm4
+	//
+	vmovapd			32(%r11), %ymm12
+	vbroadcastsd	32(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	48(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	56(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	32(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vmovapd			%ymm12, %ymm5
+	//
+	vmovapd			64(%r11), %ymm12
+	vbroadcastsd	64(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	72(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	88(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	64(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	72(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vmovapd			%ymm12, %ymm6
+	//
+	vmovapd			96(%r11), %ymm12
+	vbroadcastsd	96(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	104(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	112(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	96(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	104(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	112(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vmovapd			%ymm12, %ymm7
+	//
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+	//
+	vmovapd			0(%r11), %ymm12
+	vbroadcastsd	0(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	8(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	16(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	24(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	0(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	8(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	16(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	24(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vmovapd			%ymm12, %ymm8
+	//
+	vmovapd			32(%r11), %ymm12
+	vbroadcastsd	32(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	40(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	48(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	56(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	32(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	40(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	48(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	56(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	32(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vmovapd			%ymm12, %ymm9
+	//
+	vmovapd			64(%r11), %ymm12
+	vbroadcastsd	64(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	72(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	80(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	88(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	64(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	72(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	80(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	88(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	64(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	72(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vmovapd			%ymm12, %ymm10
+	//
+	vmovapd			96(%r11), %ymm12
+	vbroadcastsd	96(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	104(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	112(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	120(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	96(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	104(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	112(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	120(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	96(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	104(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	112(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vmovapd			%ymm12, %ymm11
+	//
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
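+	// rotate the registers so that V, its byte stride and the D pointer land
+	// in the argument slots expected by the inner tran/gemm routines below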
+	movq	%r11, %r14
+	movq	%r12, %r11
+	movq	%r13, %r12
+	movq	%r14, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_TRAN_12X4_LIB4
+	INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+	INNER_TRAN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_12x4_lib4
+	call inner_kernel_dgemm_add_nt_12x4_lib4
+	call inner_tran_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_12x4_lib4
+	callq _inner_kernel_dgemm_add_nt_12x4_lib4
+	callq _inner_tran_12x4_lib4
+#endif
+#endif
+
+	movq	ARG4, %r11 // T
+	movq	$384, %r12 // sdt hard-coded to 4*12*sizeof(double) !!!
+
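+	// Multiply the accumulated columns by the 12x12 upper triangular factor T
+	// (stored as three 4-row panels with row-panel byte stride in r12), going
+	// from column 11 down to column 0 so each column can be updated in place.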
+	//
+	vbroadcastsd	376(%r11, %r12, 2), %ymm13
+	vmulpd			%ymm11, %ymm13, %ymm11
+	//
+	vbroadcastsd	368(%r11, %r12, 2), %ymm13
+	vfmadd231pd		%ymm10, %ymm13, %ymm11
+	vbroadcastsd	336(%r11, %r12, 2), %ymm13
+	vmulpd			%ymm10, %ymm13, %ymm10
+	//
+	vbroadcastsd	360(%r11, %r12, 2), %ymm13
+	vfmadd231pd		%ymm9, %ymm13, %ymm11
+	vbroadcastsd	328(%r11, %r12, 2), %ymm13
+	vfmadd231pd		%ymm9, %ymm13, %ymm10
+	vbroadcastsd	296(%r11, %r12, 2), %ymm13
+	vmulpd			%ymm9, %ymm13, %ymm9
+	//
+	vbroadcastsd	352(%r11, %r12, 2), %ymm13
+	vfmadd231pd		%ymm8, %ymm13, %ymm11
+	vbroadcastsd	320(%r11, %r12, 2), %ymm13
+	vfmadd231pd		%ymm8, %ymm13, %ymm10
+	vbroadcastsd	288(%r11, %r12, 2), %ymm13
+	vfmadd231pd		%ymm8, %ymm13, %ymm9
+	vbroadcastsd	256(%r11, %r12, 2), %ymm13
+	vmulpd			%ymm8, %ymm13, %ymm8
+	//
+	vbroadcastsd	376(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm7, %ymm13, %ymm11
+	vbroadcastsd	344(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm7, %ymm13, %ymm10
+	vbroadcastsd	312(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm7, %ymm13, %ymm9
+	vbroadcastsd	280(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm7, %ymm13, %ymm8
+	vbroadcastsd	248(%r11, %r12, 1), %ymm13
+	vmulpd			%ymm7, %ymm13, %ymm7
+	//
+	vbroadcastsd	368(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm6, %ymm13, %ymm11
+	vbroadcastsd	336(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm6, %ymm13, %ymm10
+	vbroadcastsd	304(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm6, %ymm13, %ymm9
+	vbroadcastsd	272(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm6, %ymm13, %ymm8
+	vbroadcastsd	240(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm6, %ymm13, %ymm7
+	vbroadcastsd	208(%r11, %r12, 1), %ymm13
+	vmulpd			%ymm6, %ymm13, %ymm6
+	//
+	vbroadcastsd	360(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm5, %ymm13, %ymm11
+	vbroadcastsd	328(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm5, %ymm13, %ymm10
+	vbroadcastsd	296(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm5, %ymm13, %ymm9
+	vbroadcastsd	264(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm5, %ymm13, %ymm8
+	vbroadcastsd	232(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm5, %ymm13, %ymm7
+	vbroadcastsd	200(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm5, %ymm13, %ymm6
+	vbroadcastsd	168(%r11, %r12, 1), %ymm13
+	vmulpd			%ymm5, %ymm13, %ymm5
+	//
+	vbroadcastsd	352(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm4, %ymm13, %ymm11
+	vbroadcastsd	320(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm4, %ymm13, %ymm10
+	vbroadcastsd	288(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm4, %ymm13, %ymm9
+	vbroadcastsd	256(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm4, %ymm13, %ymm8
+	vbroadcastsd	224(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm4, %ymm13, %ymm7
+	vbroadcastsd	192(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm4, %ymm13, %ymm6
+	vbroadcastsd	160(%r11, %r12, 1), %ymm13
+	vfmadd231pd		%ymm4, %ymm13, %ymm5
+	vbroadcastsd	128(%r11, %r12, 1), %ymm13
+	vmulpd			%ymm4, %ymm13, %ymm4
+	//
+	vbroadcastsd	376(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm11
+	vbroadcastsd	344(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm10
+	vbroadcastsd	312(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm9
+	vbroadcastsd	280(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm8
+	vbroadcastsd	248(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm7
+	vbroadcastsd	216(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm6
+	vbroadcastsd	184(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm5
+	vbroadcastsd	152(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm4
+	vbroadcastsd	120(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	//
+	vbroadcastsd	368(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm11
+	vbroadcastsd	336(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm10
+	vbroadcastsd	304(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm9
+	vbroadcastsd	272(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm8
+	vbroadcastsd	240(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm7
+	vbroadcastsd	208(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm6
+	vbroadcastsd	176(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm5
+	vbroadcastsd	144(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm4
+	vbroadcastsd	112(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm3
+	vbroadcastsd	80(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	//
+	vbroadcastsd	360(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm11
+	vbroadcastsd	328(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm10
+	vbroadcastsd	296(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm9
+	vbroadcastsd	264(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm8
+	vbroadcastsd	232(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm7
+	vbroadcastsd	200(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm6
+	vbroadcastsd	168(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm5
+	vbroadcastsd	136(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm4
+	vbroadcastsd	104(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm3
+	vbroadcastsd	72(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm2
+	vbroadcastsd	40(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	//
+	vbroadcastsd	352(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm11
+	vbroadcastsd	320(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm10
+	vbroadcastsd	288(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm9
+	vbroadcastsd	256(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm8
+	vbroadcastsd	224(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm7
+	vbroadcastsd	192(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm6
+	vbroadcastsd	160(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm5
+	vbroadcastsd	128(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm4
+	vbroadcastsd	96(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm3
+	vbroadcastsd	64(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm2
+	vbroadcastsd	32(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm1
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+
+	movq	ARG6, %r10 // K
+	movq	ARG7, %r11 // km
+
+	cmpl	$4, %r11d
+	jge		0f
+
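+	// build a lane mask from km: ymm15 = { 0.5, 1.5, 2.5, 3.5 } - km has the
+	// sign bit set exactly in lanes i < km, so vblendvpd keeps the computed
+	// value there and zeroes the remaining lanes before the store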
+	vcvtsi2sd	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	vxorpd		%ymm14, %ymm14, %ymm14
+	vblendvpd	%ymm15, %ymm11, %ymm14, %ymm11
+	vblendvpd	%ymm15, %ymm10, %ymm14, %ymm10
+	vblendvpd	%ymm15, %ymm9, %ymm14, %ymm9
+	vblendvpd	%ymm15, %ymm8, %ymm14, %ymm8
+	vblendvpd	%ymm15, %ymm7, %ymm14, %ymm7
+	vblendvpd	%ymm15, %ymm6, %ymm14, %ymm6
+	vblendvpd	%ymm15, %ymm5, %ymm14, %ymm5
+	vblendvpd	%ymm15, %ymm4, %ymm14, %ymm4
+	vblendvpd	%ymm15, %ymm3, %ymm14, %ymm3
+	vblendvpd	%ymm15, %ymm2, %ymm14, %ymm2
+	vblendvpd	%ymm15, %ymm1, %ymm14, %ymm1
+	vblendvpd	%ymm15, %ymm0, %ymm14, %ymm0
+
+0:
+	vmovapd			%ymm11, 352(%r10)
+	vmovapd			%ymm10, 320(%r10)
+	vmovapd			%ymm9, 288(%r10)
+	vmovapd			%ymm8, 256(%r10)
+	vmovapd			%ymm7, 224(%r10)
+	vmovapd			%ymm6, 192(%r10)
+	vmovapd			%ymm5, 160(%r10)
+	vmovapd			%ymm4, 128(%r10)
+	vmovapd			%ymm3, 96(%r10)
+	vmovapd			%ymm2, 64(%r10)
+	vmovapd			%ymm1, 32(%r10)
+	vmovapd			%ymm0, 0(%r10)
+
+	movq	ARG1, %r10 // kmax
+	movq	ARG6, %r11 // K
+	movq	ARG2, %r12 // V
+	movq	ARG3, %r13 // sdd
+	sall	$5, %r13d
+	movq	ARG5, %r14 // D
+
+	// load block from C
+	vmovapd	0(%r14), %ymm0
+	vmovapd	32(%r14), %ymm1
+	vmovapd	64(%r14), %ymm2
+	vmovapd	96(%r14), %ymm3
+	vmovapd	128(%r14), %ymm4
+	vmovapd	160(%r14), %ymm5
+	vmovapd	192(%r14), %ymm6
+	vmovapd	224(%r14), %ymm7
+	vmovapd	256(%r14), %ymm8
+	vmovapd	288(%r14), %ymm9
+	vmovapd	320(%r14), %ymm10
+	vmovapd	352(%r14), %ymm11
+
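+	// Accumulate K times the unit-diagonal triangle of V into the reloaded D
+	// block: step j adds K(:,j) to D(:,j) and K(:,j)*V[j][c] to D(:,c) for the
+	// columns c > j (V rows 4-7 and 8-11 come from the next row panels).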
+	// 0
+	vmovapd			0(%r11), %ymm12
+	vaddpd			%ymm12, %ymm0, %ymm0
+	vbroadcastsd	32(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	128(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	160(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	192(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	224(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	256(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	288(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	320(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	352(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	// 1
+	vmovapd			32(%r11), %ymm12
+	vaddpd			%ymm12, %ymm1, %ymm1
+	vbroadcastsd	72(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	104(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	136(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	168(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	200(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	232(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	264(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	296(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	328(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	360(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	// 2
+	vmovapd			64(%r11), %ymm12
+	vaddpd			%ymm12, %ymm2, %ymm2
+	vbroadcastsd	112(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	144(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	176(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	208(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	240(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	272(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	304(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	336(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	368(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	// 3
+	vmovapd			96(%r11), %ymm12
+	vaddpd			%ymm12, %ymm3, %ymm3
+	vbroadcastsd	152(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	184(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	216(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	248(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	280(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	312(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	344(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	376(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	// 4
+	vmovapd			128(%r11), %ymm12
+	vaddpd			%ymm12, %ymm4, %ymm4
+	vbroadcastsd	160(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	192(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	224(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	256(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	288(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	320(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	352(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	// 5
+	vmovapd			160(%r11), %ymm12
+	vaddpd			%ymm12, %ymm5, %ymm5
+	vbroadcastsd	200(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	232(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	264(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	296(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	328(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	360(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	// 6
+	vmovapd			192(%r11), %ymm12
+	vaddpd			%ymm12, %ymm6, %ymm6
+	vbroadcastsd	240(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+	vbroadcastsd	272(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	304(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	336(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	368(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	// 7
+	vmovapd			224(%r11), %ymm12
+	vaddpd			%ymm12, %ymm7, %ymm7
+	vbroadcastsd	280(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm8
+	vbroadcastsd	312(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	344(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	376(%r12, %r13, 1), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	// 8
+	vmovapd			256(%r11), %ymm12
+	vaddpd			%ymm12, %ymm8, %ymm8
+	vbroadcastsd	288(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm9
+	vbroadcastsd	320(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	352(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	// 9
+	vmovapd			288(%r11), %ymm12
+	vaddpd			%ymm12, %ymm9, %ymm9
+	vbroadcastsd	328(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm10
+	vbroadcastsd	360(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	// 10
+	vmovapd			320(%r11), %ymm12
+	vaddpd			%ymm12, %ymm10, %ymm10
+	vbroadcastsd	368(%r12, %r13, 2), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm11
+	// 11
+	vmovapd			352(%r11), %ymm12
+	vaddpd			%ymm12, %ymm11, %ymm11
+
+	// store block to C
+	vmovapd	%ymm0, 0(%r14)
+	vmovapd	%ymm1, 32(%r14)
+	vmovapd	%ymm2, 64(%r14)
+	vmovapd	%ymm3, 96(%r14)
+	vmovapd	%ymm4, 128(%r14)
+	vmovapd	%ymm5, 160(%r14)
+	vmovapd	%ymm6, 192(%r14)
+	vmovapd	%ymm7, 224(%r14)
+	vmovapd	%ymm8, 256(%r14)
+	vmovapd	%ymm9, 288(%r14)
+	vmovapd	%ymm10, 320(%r14)
+	vmovapd	%ymm11, 352(%r14)
+
+	subl	$12, %r10d
+	addq	$384, %r12
+	addq	$384, %r14
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEBP_ADD_NN_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgebp_add_nn_4x12_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgebp_add_nn_4x12_lib4
+#endif
+#endif
+
+100:
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlarfb12_r_4_lib4, .-kernel_dlarfb12_r_4_lib4
+#endif
+
+
+
+
+
+//                               1          2           3           4           5
+// void kernel_dlarfb4_r_12_lib4(int kmax, double *pV, double *pT, double *pD, int sdd);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dlarfb4_r_12_lib4
+	.type kernel_dlarfb4_r_12_lib4, @function
+kernel_dlarfb4_r_12_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dlarfb4_r_12_lib4
+_kernel_dlarfb4_r_12_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dlarfb4_r_12_lib4
+	.def kernel_dlarfb4_r_12_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_12_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+//	vxorpd	%ymm0, %ymm0, %ymm0
+//	vmovapd	%ymm0, %ymm1
+//	vmovapd	%ymm0, %ymm2
+//	vmovapd	%ymm0, %ymm3
+//	vmovapd	%ymm0, %ymm4
+//	vmovapd	%ymm0, %ymm5
+//	vmovapd	%ymm0, %ymm6
+//	vmovapd	%ymm0, %ymm7
+//	vmovapd	%ymm0, %ymm8
+//	vmovapd	%ymm0, %ymm9
+//	vmovapd	%ymm0, %ymm10
+//	vmovapd	%ymm0, %ymm11
+	
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // D
+	movq	ARG5, %r12 // sdd
+	sall	$5, %r12d
+	movq	ARG2, %r13 // V
+
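+	// Same triangular-product pattern as in kernel_dlarfb12_r_4_lib4 above:
+	// each accumulator starts as a copy of a D column and gathers D(:,c)*V[j][c]
+	// for the later columns c, with ymm0-3 / ymm4-7 / ymm8-11 holding one
+	// 4-row panel of D each.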
+	//
+	vmovapd			0(%r11), %ymm0
+	vmovapd			0(%r11, %r12, 1), %ymm4
+	vmovapd			0(%r11, %r12, 2), %ymm8
+	//
+	vmovapd			32(%r11), %ymm1
+	vmovapd			32(%r11, %r12, 1), %ymm5
+	vmovapd			32(%r11, %r12, 2), %ymm9
+	vbroadcastsd	32(%r13), %ymm13
+	vfmadd231pd		%ymm13, %ymm1, %ymm0
+	vfmadd231pd		%ymm13, %ymm5, %ymm4
+	vfmadd231pd		%ymm13, %ymm9, %ymm8
+	//
+	vmovapd			64(%r11), %ymm2
+	vmovapd			64(%r11, %r12, 1), %ymm6
+	vmovapd			64(%r11, %r12, 2), %ymm10
+	vbroadcastsd	64(%r13), %ymm13
+	vfmadd231pd		%ymm13, %ymm2, %ymm0
+	vfmadd231pd		%ymm13, %ymm6, %ymm4
+	vfmadd231pd		%ymm13, %ymm10, %ymm8
+	vbroadcastsd	72(%r13), %ymm13
+	vfmadd231pd		%ymm13, %ymm2, %ymm1
+	vfmadd231pd		%ymm13, %ymm6, %ymm5
+	vfmadd231pd		%ymm13, %ymm10, %ymm9
+	//
+	vmovapd			96(%r11), %ymm3
+	vmovapd			96(%r11, %r12, 1), %ymm7
+	vmovapd			96(%r11, %r12, 2), %ymm11
+	vbroadcastsd	96(%r13), %ymm13
+	vfmadd231pd		%ymm13, %ymm3, %ymm0
+	vfmadd231pd		%ymm13, %ymm7, %ymm4
+	vfmadd231pd		%ymm13, %ymm11, %ymm8
+	vbroadcastsd	104(%r13), %ymm13
+	vfmadd231pd		%ymm13, %ymm3, %ymm1
+	vfmadd231pd		%ymm13, %ymm7, %ymm5
+	vfmadd231pd		%ymm13, %ymm11, %ymm9
+	vbroadcastsd	112(%r13), %ymm13
+	vfmadd231pd		%ymm13, %ymm3, %ymm2
+	vfmadd231pd		%ymm13, %ymm7, %ymm6
+	vfmadd231pd		%ymm13, %ymm11, %ymm10
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+	movq	ARG3, %r10 // T
+
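+	// multiply the accumulators by the 4x4 upper triangular factor T, from
+	// column 3 down to column 0 so each column can be updated in place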
+	//
+	vbroadcastsd	120(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vmulpd			%ymm7, %ymm12, %ymm7
+	vmulpd			%ymm11, %ymm12, %ymm11
+	//
+	vbroadcastsd	112(%r10), %ymm12
+	vfmadd231pd		%ymm2, %ymm12, %ymm3
+	vfmadd231pd		%ymm6, %ymm12, %ymm7
+	vfmadd231pd		%ymm10, %ymm12, %ymm11
+	vbroadcastsd	80(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vmulpd			%ymm6, %ymm12, %ymm6
+	vmulpd			%ymm10, %ymm12, %ymm10
+	//
+	vbroadcastsd	104(%r10), %ymm12
+	vfmadd231pd		%ymm1, %ymm12, %ymm3
+	vfmadd231pd		%ymm5, %ymm12, %ymm7
+	vfmadd231pd		%ymm9, %ymm12, %ymm11
+	vbroadcastsd	72(%r10), %ymm12
+	vfmadd231pd		%ymm1, %ymm12, %ymm2
+	vfmadd231pd		%ymm5, %ymm12, %ymm6
+	vfmadd231pd		%ymm9, %ymm12, %ymm10
+	vbroadcastsd	40(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vmulpd			%ymm5, %ymm12, %ymm5
+	vmulpd			%ymm9, %ymm12, %ymm9
+	//
+	vbroadcastsd	96(%r10), %ymm12
+	vfmadd231pd		%ymm0, %ymm12, %ymm3
+	vfmadd231pd		%ymm4, %ymm12, %ymm7
+	vfmadd231pd		%ymm8, %ymm12, %ymm11
+	vbroadcastsd	64(%r10), %ymm12
+	vfmadd231pd		%ymm0, %ymm12, %ymm2
+	vfmadd231pd		%ymm4, %ymm12, %ymm6
+	vfmadd231pd		%ymm8, %ymm12, %ymm10
+	vbroadcastsd	32(%r10), %ymm12
+	vfmadd231pd		%ymm0, %ymm12, %ymm1
+	vfmadd231pd		%ymm4, %ymm12, %ymm5
+	vfmadd231pd		%ymm8, %ymm12, %ymm9
+	vbroadcastsd	0(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+	vmulpd			%ymm4, %ymm12, %ymm4
+	vmulpd			%ymm8, %ymm12, %ymm8
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // V
+	movq	ARG4, %r12 // D
+	movq	ARG5, %r13 // sdd
+	sall	$5, %r13d
+
+	//
+	vmovapd			0(%r12), %ymm12
+	vmovapd			0(%r12, %r13, 1), %ymm14
+	vmovapd			0(%r12, %r13, 2), %ymm15
+	vaddpd			%ymm12, %ymm0, %ymm12
+	vaddpd			%ymm14, %ymm4, %ymm14
+	vaddpd			%ymm15, %ymm8, %ymm15
+	vmovapd			%ymm12, 0(%r12)
+	vmovapd			%ymm14, 0(%r12, %r13, 1)
+	vmovapd			%ymm15, 0(%r12, %r13, 2)
+	//
+	vmovapd			32(%r12), %ymm12
+	vmovapd			32(%r12, %r13, 1), %ymm14
+	vmovapd			32(%r12, %r13, 2), %ymm15
+	vbroadcastsd	32(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vfmadd231pd		%ymm4, %ymm13, %ymm14
+	vfmadd231pd		%ymm8, %ymm13, %ymm15
+	vaddpd			%ymm12, %ymm1, %ymm12
+	vaddpd			%ymm14, %ymm5, %ymm14
+	vaddpd			%ymm15, %ymm9, %ymm15
+	vmovapd			%ymm12, 32(%r12)
+	vmovapd			%ymm14, 32(%r12, %r13, 1)
+	vmovapd			%ymm15, 32(%r12, %r13, 2)
+	//
+	vmovapd			64(%r12), %ymm12
+	vmovapd			64(%r12, %r13, 1), %ymm14
+	vmovapd			64(%r12, %r13, 2), %ymm15
+	vbroadcastsd	64(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vfmadd231pd		%ymm4, %ymm13, %ymm14
+	vfmadd231pd		%ymm8, %ymm13, %ymm15
+	vbroadcastsd	72(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vfmadd231pd		%ymm5, %ymm13, %ymm14
+	vfmadd231pd		%ymm9, %ymm13, %ymm15
+	vaddpd			%ymm12, %ymm2, %ymm12
+	vaddpd			%ymm14, %ymm6, %ymm14
+	vaddpd			%ymm15, %ymm10, %ymm15
+	vmovapd			%ymm12, 64(%r12)
+	vmovapd			%ymm14, 64(%r12, %r13, 1)
+	vmovapd			%ymm15, 64(%r12, %r13, 2)
+	//
+	vmovapd			96(%r12), %ymm12
+	vmovapd			96(%r12, %r13, 1), %ymm14
+	vmovapd			96(%r12, %r13, 2), %ymm15
+	vbroadcastsd	96(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vfmadd231pd		%ymm4, %ymm13, %ymm14
+	vfmadd231pd		%ymm8, %ymm13, %ymm15
+	vbroadcastsd	104(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vfmadd231pd		%ymm5, %ymm13, %ymm14
+	vfmadd231pd		%ymm9, %ymm13, %ymm15
+	vbroadcastsd	112(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vfmadd231pd		%ymm6, %ymm13, %ymm14
+	vfmadd231pd		%ymm10, %ymm13, %ymm15
+	vaddpd			%ymm12, %ymm3, %ymm12
+	vaddpd			%ymm14, %ymm7, %ymm14
+	vaddpd			%ymm15, %ymm11, %ymm15
+	vmovapd			%ymm12, 96(%r12)
+	vmovapd			%ymm14, 96(%r12, %r13, 1)
+	vmovapd			%ymm15, 96(%r12, %r13, 2)
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEBP_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgebp_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgebp_add_nn_12x4_lib4
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlarfb4_r_12_lib4, .-kernel_dlarfb4_r_12_lib4
+#endif
+
+
+
+
+
+// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
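+	// each .long pair below is the little-endian low/high word of one IEEE-754
+	// double, e.g. (0, 1071644672) = 0x3FE0000000000000 = 0.5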
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+LC04: // { 11.5 10.5 9.5 8.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1075904512
+	.long	0
+	.long	1076035584
+	.long	0
+	.long	1076166656
+	.long	0
+	.long	1076297728
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC05: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC05: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgemm_4x4_lib4.S b/kernel/avx2/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..c9bf696
--- /dev/null
+++ b/kernel/avx2/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,9433 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
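+// System V AMD64 ABI: only rbx, rbp and r12-r15 are callee-saved integer
+// registers and no vector registers need to be preserved, so PROLOGUE spills
+// just those six and issues vzeroupper.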
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
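+// Win64 ABI: stack arguments sit above the return address and the caller's
+// 32-byte shadow space (hence the +40 offsets); rdi, rsi and xmm6-xmm15 are
+// callee-saved, so PROLOGUE spills them as well.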
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+4*k*sizeof(double)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+	
+// broadcast scheme
+#if 1
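+// Two alternative inner loops are provided: the broadcast scheme enabled here
+// broadcasts one B element per FMA, while the shuffle scheme in the #else
+// branch keeps B packed in a register and permutes it between FMAs.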
+
+	cmpl	$0, %r10d
+	jle		5f // return
+
+	// preload
+	vmovapd 		0(%r11), %ymm13 // A
+
+	vxorpd			%ymm4, %ymm4, %ymm4
+	vmovapd			%ymm4, %ymm5
+	vmovapd			%ymm4, %ymm6
+	vmovapd			%ymm4, %ymm7
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	addq	$128, %r12
+
+	// unroll 0
+	vbroadcastsd	-32(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	-24(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vbroadcastsd	-16(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vbroadcastsd	-8(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	addq	$128, %r12
+
+	// unroll 0
+	vbroadcastsd	-32(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	-24(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vbroadcastsd	-16(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vbroadcastsd	-8(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+
+	addq	$32, %r11
+	addq	$32, %r12
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // reduce
+
+	vaddpd			%ymm4, %ymm0, %ymm0
+	vaddpd			%ymm5, %ymm1, %ymm1
+	vaddpd			%ymm6, %ymm2, %ymm2
+	vaddpd			%ymm7, %ymm3, %ymm3
+
+5: // return
+
+// shuffle scheme
+#else
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	vxorpd		%ymm4, %ymm4, %ymm4
+	vmovapd		%ymm4, %ymm5
+	vmovapd		%ymm4, %ymm6
+	vmovapd		%ymm4, %ymm7
+
+	// preload
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r12), %ymm12 // B[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmovapd 	32(%r12), %ymm13 // B[4]
+	vfmadd231pd	%ymm8, %ymm12, %ymm0
+	vshufpd 	$0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128	$0x1, %ymm14, %ymm14, %ymm12
+	vfmadd231pd	%ymm8, %ymm14, %ymm1
+	vmovapd 	32(%r11), %ymm10 // A0[4]
+
+	vfmadd231pd	%ymm8, %ymm12, %ymm3
+	vshufpd 	$0x5, %ymm12, %ymm12, %ymm14
+
+	subl	$4, %r10d
+	vfmadd231pd	%ymm8, %ymm14, %ymm2
+
+	// unroll 1
+	vmovapd 	64(%r12), %ymm12 // B[8]
+	vfmadd231pd	%ymm10, %ymm13, %ymm4
+	vshufpd 	$0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128	$0x1, %ymm14, %ymm14, %ymm13
+	vfmadd231pd	%ymm10, %ymm14, %ymm5
+	vmovapd 	64(%r11), %ymm8 // A0[8]
+
+	vfmadd231pd	%ymm10, %ymm13, %ymm7
+	vshufpd 	$0x5, %ymm13, %ymm13, %ymm14
+
+	vfmadd231pd	%ymm10, %ymm14, %ymm6
+
+	// unroll 2
+	vmovapd 	96(%r12), %ymm13 // B[12]
+	vfmadd231pd	%ymm8, %ymm12, %ymm0
+	vshufpd 	$0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128	$0x1, %ymm14, %ymm14, %ymm12
+	vfmadd231pd	%ymm8, %ymm14, %ymm1
+	vmovapd 	96(%r11), %ymm10 // A0[12]
+
+	vfmadd231pd	%ymm8, %ymm12, %ymm3
+	vshufpd 	$0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r12
+
+	vfmadd231pd	%ymm8, %ymm14, %ymm2
+	addq	$128, %r11
+
+
+	// unroll 3
+	vmovapd 	0(%r12), %ymm12 // B[0]
+	vfmadd231pd	%ymm10, %ymm13, %ymm4
+	vshufpd 	$0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128	$0x1, %ymm14, %ymm14, %ymm13
+	vfmadd231pd	%ymm10, %ymm14, %ymm5
+	vmovapd 	0(%r11), %ymm8 // A0[0]
+
+	vfmadd231pd	%ymm10, %ymm13, %ymm7
+	vshufpd 	$0x5, %ymm13, %ymm13, %ymm14
+
+	vfmadd231pd	%ymm10, %ymm14, %ymm6
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vmovapd 	32(%r12), %ymm13 // B[4]
+	vfmadd231pd	%ymm8, %ymm12, %ymm0
+	vshufpd 	$0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128	$0x1, %ymm14, %ymm14, %ymm12
+	vfmadd231pd	%ymm8, %ymm14, %ymm1
+	vmovapd 	32(%r11), %ymm10 // A0[4]
+
+	vfmadd231pd	%ymm8, %ymm12, %ymm3
+	vshufpd 	$0x5, %ymm12, %ymm12, %ymm14
+
+	subl	$4, %r10d
+	vfmadd231pd	%ymm8, %ymm14, %ymm2
+
+	// unroll 1
+	vmovapd 	64(%r12), %ymm12 // B[8]
+	vfmadd231pd	%ymm10, %ymm13, %ymm4
+	vshufpd 	$0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128	$0x1, %ymm14, %ymm14, %ymm13
+	vfmadd231pd	%ymm10, %ymm14, %ymm5
+	vmovapd 	64(%r11), %ymm8 // A0[8]
+
+	vfmadd231pd	%ymm10, %ymm13, %ymm7
+	vshufpd 	$0x5, %ymm13, %ymm13, %ymm14
+
+	vfmadd231pd	%ymm10, %ymm14, %ymm6
+
+	// unroll 2
+	vmovapd 	96(%r12), %ymm13 // B[12]
+	vfmadd231pd	%ymm8, %ymm12, %ymm0
+	vshufpd 	$0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128	$0x1, %ymm14, %ymm14, %ymm12
+	vfmadd231pd	%ymm8, %ymm14, %ymm1
+	vmovapd 	96(%r11), %ymm10 // A0[12]
+
+	vfmadd231pd	%ymm8, %ymm12, %ymm3
+	vshufpd 	$0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r12
+
+	vfmadd231pd	%ymm8, %ymm14, %ymm2
+	addq	$128, %r11
+
+
+	// unroll 3
+//	vmovapd 	0(%r12), %ymm12 // B[0]
+	vfmadd231pd	%ymm10, %ymm13, %ymm4
+	vshufpd 	$0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128	$0x1, %ymm14, %ymm14, %ymm13
+	vfmadd231pd	%ymm10, %ymm14, %ymm5
+//	vmovapd 	0(%r11), %ymm8 // A0[0]
+
+	vfmadd231pd	%ymm10, %ymm13, %ymm7
+	vshufpd 	$0x5, %ymm13, %ymm13, %ymm14
+
+	vfmadd231pd	%ymm10, %ymm14, %ymm6
+
+
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd 0(%r12), %ymm12 // B[0]
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vfmadd231pd	%ymm8, %ymm12, %ymm0
+	addq	$32, %r11
+
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vfmadd231pd	%ymm8, %ymm14, %ymm1
+	addq	$32, %r12
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+	vfmadd231pd	%ymm8, %ymm14, %ymm3
+
+	vshufpd $0x5, %ymm14, %ymm14, %ymm14
+	vfmadd231pd	%ymm8, %ymm14, %ymm2
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+
+	jg		3b // clean up loop 
+
+
+2: // return
+
+	vaddpd			%ymm4, %ymm0, %ymm0
+	vaddpd			%ymm5, %ymm1, %ymm1
+	vaddpd			%ymm6, %ymm2, %ymm2
+	vaddpd			%ymm7, %ymm3, %ymm3
+
+#endif
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+4*k*sizeof(double)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nt_4x4_lib4, @function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		5f // return
+
+	vxorpd			%ymm4, %ymm4, %ymm4
+	vmovapd			%ymm4, %ymm5
+	vmovapd			%ymm4, %ymm6
+	vmovapd			%ymm4, %ymm7
+
+	// preload
+	vmovapd 		0(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 2
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	addq	$128, %r12
+
+	// unroll 3
+	vbroadcastsd	-32(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	-24(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vbroadcastsd	-16(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vbroadcastsd	-8(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 2
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	addq	$128, %r12
+
+	// unroll 3
+	vbroadcastsd	-32(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	-24(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vbroadcastsd	-16(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vbroadcastsd	-8(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+
+	jmp		2f // reduce
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // reduce
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+
+	addq	$32, %r11
+	addq	$32, %r12
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // reduce
+
+	vaddpd			%ymm4, %ymm0, %ymm0
+	vaddpd			%ymm5, %ymm1, %ymm1
+	vaddpd			%ymm6, %ymm2, %ymm2
+	vaddpd			%ymm7, %ymm3, %ymm3
+
+5: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
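+// rough C sketch (exposition only): B is stored in 4-row panels with panel stride
+// sdb, so element B(kk,jj) sits at index (kk/4)*4*sdb + kk%4 + 4*jj, and the nn
+// kernel below computes
+//
+//   for(kk=0; kk<k; kk++)
+//     for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<4; ii++)
+//         D[ii+4*jj] += A[ii+4*kk] * B[(kk/4)*4*sdb + kk%4 + 4*jj];
+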
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nn_4x4_lib4, @function
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		5f // return
+
+	vxorpd			%ymm4, %ymm4, %ymm4
+	vmovapd			%ymm4, %ymm5
+	vmovapd			%ymm4, %ymm6
+	vmovapd			%ymm4, %ymm7
+
+	// preload
+	vmovapd 		0(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	0(%r12, %r13, 2) // software prefetch
+	prefetcht0	64(%r12, %r13, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 2
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+
+	// unroll 3
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	addq	%r13, %r12
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 2
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+
+	// unroll 3
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	addq	%r13, %r12
+
+	jmp		2f // reduce
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // reduce
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+
+	addq	$32, %r11
+	addq	$8, %r12
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // reduce
+
+	vaddpd			%ymm4, %ymm0, %ymm0
+	vaddpd			%ymm5, %ymm1, %ymm1
+	vaddpd			%ymm6, %ymm2, %ymm2
+	vaddpd			%ymm7, %ymm3, %ymm3
+
+5: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nn_4x4_lib4, .-inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nn_4x4_lib4, @function
+inner_kernel_dgemm_sub_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		5f // return
+
+	vxorpd			%ymm4, %ymm4, %ymm4
+	vmovapd			%ymm4, %ymm5
+	vmovapd			%ymm4, %ymm6
+	vmovapd			%ymm4, %ymm7
+
+	// preload
+	vmovapd 		0(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	0(%r12, %r13, 2) // software prefetch
+	prefetcht0	64(%r12, %r13, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 2
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+
+	// unroll 3
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	addq	%r13, %r12
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 2
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+
+	// unroll 3
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm4
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm5
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm6
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm14, %ymm12, %ymm7
+	addq	%r13, %r12
+
+	jmp		2f // reduce
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // reduce
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vfnmadd231pd	%ymm13, %ymm12, %ymm3
+
+	addq	$32, %r11
+	addq	$8, %r12
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // reduce
+
+	vaddpd			%ymm4, %ymm0, %ymm0
+	vaddpd			%ymm5, %ymm1, %ymm1
+	vaddpd			%ymm6, %ymm2, %ymm2
+	vaddpd			%ymm7, %ymm3, %ymm3
+
+5: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nn_4x4_lib4, .-inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- B
+// r12   <- C
+// ymm0  <- [a00 a10 a20 a30]
+// ymm1  <- [a01 a11 a21 a31]
+// ymm2  <- [a02 a12 a22 a32]
+// ymm3  <- [a03 a13 a23 a33]
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- ?
+// r12   <- ?
+// ymm0  <- [a00 a10 a20 a30]
+// ymm1  <- [a01 a11 a21 a31]
+// ymm2  <- [a02 a12 a22 a32]
+// ymm3  <- [a03 a13 a23 a33]
+
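+// rough C sketch (exposition only): ymm0..ymm3 hold the four columns of a fixed
+// 4x4 block of A, and the loop streams over the k columns of B and C:
+//
+//   for(jj=0; jj<k; jj++)
+//     for(ll=0; ll<4; ll++)
+//       for(ii=0; ii<4; ii++)
+//         C[ii+4*jj] += A[ii+4*ll] * B[ll+4*jj];
+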
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEBP_ADD_NN_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgebp_add_nn_4x4_lib4, @function
+inner_kernel_dgebp_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgebp_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_4x4_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r12), %ymm12
+	vbroadcastsd	0(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vbroadcastsd	8(%r11), %ymm13
+	subl	$4, %r10d
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vbroadcastsd	16(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vbroadcastsd	24(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm12
+	vmovapd			%ymm12, 0(%r12)
+
+	vmovapd			32(%r12), %ymm12
+	vbroadcastsd	32(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vbroadcastsd	40(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vbroadcastsd	48(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vbroadcastsd	56(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm12
+	vmovapd			%ymm12, 32(%r12)
+
+	vmovapd			64(%r12), %ymm12
+	vbroadcastsd	64(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vbroadcastsd	72(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vbroadcastsd	80(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vbroadcastsd	88(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm12
+	vmovapd			%ymm12, 64(%r12)
+
+	vmovapd			96(%r12), %ymm12
+	vbroadcastsd	96(%r11), %ymm13
+	addq	$128, %r11
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vbroadcastsd	-24(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vbroadcastsd	-16(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vbroadcastsd	-8(%r11), %ymm13
+	addq	$128, %r12
+	vfmadd231pd		%ymm3, %ymm13, %ymm12
+	vmovapd			%ymm12, -32(%r12)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r12), %ymm12
+	vbroadcastsd	0(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vbroadcastsd	8(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vbroadcastsd	16(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vbroadcastsd	24(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm12
+	vmovapd			%ymm12, 0(%r12)
+
+	addq	$32, %r11
+	addq	$32, %r12
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // main loop
+
+	// return
+0:
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgebp_add_nn_4x4_lib4, .-inner_kernel_dgebp_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
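+// rough C sketch (exposition only): this edge consumes the first
+// kend = min(k, 4-offB) values of kk so that B becomes panel-aligned for the main
+// nn kernel:
+//
+//   kend = (4-offB < k) ? 4-offB : k;
+//   for(kk=0; kk<kend; kk++)
+//     for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<4; ii++)
+//         D[ii+4*jj] += A[ii+4*kk] * B[offB+kk + 4*jj];
+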
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgemm_add_nn_4x4_lib4, @function
+inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r14d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$4, %r15d
+	subl			%r14d, %r15d // 4-offsetB
+	cmpl			%r10d, %r15d
+//	jle				0f
+//	movl			%r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+	cmovgl			%r10d, %r15d // kend=min(k,4-offsetB)
+
+	movl			%r14d, %eax
+	sall			$3, %eax // offsetB*sizeof(double)
+	addq			%rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+	vmovapd			0(%r11), %ymm12
+	vbroadcastsd	0(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+
+	subl			$1, %r10d // k-1
+	subl			$1, %r15d // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r15d
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r13, %r12
+	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgemm_add_nn_4x4_lib4, .-inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10   <- A
+// r11   <- B
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- A+4*4*sizeof(double)
+// r11   <- B+4*4*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
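+// rough C sketch (exposition only): only the structurally nonzero entries of the
+// leading 4x4 triangle of the packed B panel are used:
+//
+//   for(kk=0; kk<4; kk++)
+//     for(jj=0; jj<=kk; jj++)
+//       for(ii=0; ii<4; ii++)
+//         D[ii+4*jj] += A[ii+4*kk] * B[jj+4*kk];
+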
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_4x4_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#endif
+#endif
+	
+	vmovapd			0(%r10), %ymm8
+	vbroadcastsd	0(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+
+	vmovapd			32(%r10), %ymm8
+	vbroadcastsd	32(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	40(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+
+	vmovapd			64(%r10), %ymm8
+	vbroadcastsd	64(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	72(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	80(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+
+	vmovapd			96(%r10), %ymm8
+	vbroadcastsd	96(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	104(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	112(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vbroadcastsd	120(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+
+	addq			$128, %r10
+	addq			$128, %r11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_4x4_lib4, .-inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- max(k-4,0)
+// r11   <- A+4*4*sizeof(double)
+// r12   <- B+4*4*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_4x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+#endif
+	
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	addq			$32, %r11
+	addq			$32, %r12
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	addq			$32, %r11
+	vbroadcastsd	8(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	addq			$32, %r12
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	8(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	addq			$32, %r11
+	vbroadcastsd	16(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	addq			$32, %r12
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	8(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	16(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	addq			$32, %r11
+	vbroadcastsd	24(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	addq			$32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_4x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
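+// rough C sketch of the aligned case, offB==0 (exposition only):
+//
+//   for(kk=0; kk<4; kk++)
+//     for(jj=0; jj<=kk; jj++)
+//       for(ii=0; ii<4; ii++)
+//         D[ii+4*jj] += A[ii+4*kk] * B[kk+4*jj];
+//
+// the offB>0 branches below do the same work starting offB rows into B's current
+// panel, stepping to the next panel when the panel boundary is crossed.
+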
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nn_rl_4x4_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r14d
+	jg		0f
+
+	// offB==0
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+
+	vmovapd			32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	40(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+
+	vmovapd			64(%r11), %ymm8
+	vbroadcastsd	16(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	48(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	80(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+
+	vmovapd			96(%r11), %ymm8
+	vbroadcastsd	24(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	56(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	88(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vbroadcastsd	120(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A+4*bs*sizeof(double)
+	addq			%r13, %r12 // B+bs*sdb*sizeof(double)
+
+	jmp		3f
+
+0:
+	cmpl	$1, %r14d
+	jg		1f
+
+	// offB==1
+
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+
+	vmovapd			32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	40(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+
+	vmovapd			64(%r11), %ymm8
+	vbroadcastsd	16(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	48(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	80(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+
+	subl			$3, %r10d // k-3
+	addq			$96, %r11 // A+3*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$8, %r12 // B+bs*sdb*sizeof(double)-1
+
+	jmp		3f
+
+1:
+	cmpl	$2, %r14d
+	jg		2f
+
+	// offB==2
+
+	addq			$16, %r12 // B+2*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+
+	vmovapd			32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	40(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+
+	subl			$2, %r10d // k-2
+	addq			$64, %r11 // A+2*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$16, %r12 // B+bs*sdb*sizeof(double)-2
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+
+	vmovapd			32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	40(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	72(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vbroadcastsd	104(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+
+	vmovapd			64(%r11), %ymm8
+	vbroadcastsd	16(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	48(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	80(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vbroadcastsd	112(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+
+	vmovapd			96(%r11), %ymm8
+	vbroadcastsd	24(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	56(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	88(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vbroadcastsd	120(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A+4*bs*sizeof(double)
+	addq			%r13, %r12 // B+bs*sdb*sizeof(double)
+
+	jmp		3f
+
+2:
+	// offB==3
+
+	addq			$24, %r12 // B+3*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-3
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+
+	vmovapd			32(%r11), %ymm8
+	vbroadcastsd	8(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	40(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	72(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+
+	vmovapd			64(%r11), %ymm8
+	vbroadcastsd	16(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	48(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	80(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vbroadcastsd	112(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+
+	vmovapd			96(%r11), %ymm8
+	vbroadcastsd	24(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	56(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	88(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vbroadcastsd	120(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A+4*bs*sizeof(double)
+	addq			%r13, %r12 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nn_rl_4x4_lib4, .-inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NN_RL_4X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nn_rl_4x4_gen_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nn_rl_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	cmpl			$0, %r14d
+	jg				0f // offB>0
+
+	// offB==0
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f // end
+
+0:
+	cmpl			$1, %r14d
+	jg				1f // offB>1
+
+	// offB==1
+
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f // end
+
+1:
+	cmpl			$2, %r14d
+	jg				2f // offB>2
+
+	// offB==2
+
+	addq			$16, %r12 // B+2*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f
+
+2:
+	// offB==3
+
+	addq			$24, %r12 // B+3*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			%r13, %r12
+	subq			$24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nn_rl_4x4_gen_lib4, .-inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for dlauum
+//
+// input arguments:
+// r10   <- A
+// r11   <- B
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- A+4*4*sizeof(double)
+// r11   <- B+4*4*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
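+// rough C sketch (exposition only): the dtrmm nt pattern restricted to the
+// triangular parts of both operands; column kk of A is masked with vblendpd so
+// that only rows 0..kk contribute:
+//
+//   for(kk=0; kk<4; kk++)
+//     for(jj=0; jj<=kk; jj++)
+//       for(ii=0; ii<=kk; ii++)
+//         D[ii+4*jj] += A[ii+4*kk] * B[jj+4*kk];
+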
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DLAUUM_NT_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dlauum_nt_4x4_lib4, @function
+inner_edge_dlauum_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dlauum_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dlauum_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dlauum_nt_4x4_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vmovapd			0(%r10), %ymm8
+	vblendpd		$0x1, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+
+	vmovapd			32(%r10), %ymm8
+	vblendpd		$0x3, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	32(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	40(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+
+	vmovapd			64(%r10), %ymm8
+	vblendpd		$0x7, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	64(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	72(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	80(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+
+	vmovapd			96(%r10), %ymm8
+	vbroadcastsd	96(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	104(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	112(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vbroadcastsd	120(%r11), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+
+	addq			$128, %r10
+	addq			$128, %r11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dlauum_nt_4x4_lib4, .-inner_edge_dlauum_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for dlauum
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+4*k*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DLAUUM_NT_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dlauum_nt_4x4_vs_lib4, @function
+inner_edge_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dlauum_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dlauum_nt_4x4_vs_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vblendpd		$0x1, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	addq			$32, %r11
+	addq			$32, %r12
+
+	cmpl			$0, %r10d
+	jle				0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vblendpd		$0x3, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	8(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	addq			$32, %r11
+	addq			$32, %r12
+
+	cmpl			$0, %r10d
+	jle				0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vblendpd		$0x7, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	8(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	16(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	addq			$32, %r11
+	addq			$32, %r12
+
+	cmpl			$0, %r10d
+	jle				0f
+
+	vmovapd			0(%r11), %ymm8
+	subl			$1, %r10d
+	vbroadcastsd	0(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vbroadcastsd	8(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vbroadcastsd	16(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vbroadcastsd	24(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	addq			$32, %r11
+	addq			$32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dlauum_nt_4x4_vs_lib4, .-inner_edge_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
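+// readability note: the shuffle-based nt kernels accumulate D with its rows
+// rotated inside each column; the two rounds of vblendpd below only reorder those
+// lanes back into plain columns, with no arithmetic, e.g. the first blend gives
+//
+//   ymm8 = { ymm0[0], ymm1[1], ymm0[2], ymm1[3] } // mask 0xa takes lanes 1,3 from ymm1
+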
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_4x4_lib4, @function
+inner_blend_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_4x4_lib4:
+#endif
+#endif	
+	
+
+	// tc==n
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_4x4_lib4, .-inner_blend_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
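+// rough C sketch (exposition only): per 4x4 block this performs
+//
+//   for(jj=0; jj<4; jj++)
+//     for(ii=0; ii<4; ii++)
+//       D[ii+4*jj] = alpha*D[ii+4*jj] + beta*C[ii+4*jj];
+//
+// with the loads of C skipped entirely when beta==0.0, so C is not read in that case.
+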
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_4x4_lib4, @function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
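+// readability note (exposition only): when offset>0 each 4x1 column of C spans two
+// row panels, C0 and C1 = C0 + 4*sdc doubles; the vblendpd/vperm2f128/vshufpd
+// sequences below assemble the column roughly as
+//
+//   c[ii] = (ii < 4-offset) ? C0[offset+ii + 4*jj] : C1[ii-(4-offset) + 4*jj];
+//
+// before applying the same alpha/beta update as in the aligned case.
+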
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_4x4_gen_lib4, @function
+inner_scale_ab_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_gen_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+
+	vxorpd		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovapd		0(%r13), %ymm12
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vmovapd		32(%r13), %ymm12
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vmovapd		96(%r13), %ymm12
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+
+	jmp		3f
+
+0:
+
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$1, %r12d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vfmadd231pd	%ymm13, %ymm15, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vfmadd231pd	%ymm13, %ymm15, %ymm3
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r12d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vfmadd231pd	%ymm13, %ymm15, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vfmadd231pd	%ymm13, %ymm15, %ymm3
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vfmadd231pd	%ymm13, %ymm15, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vfmadd231pd	%ymm13, %ymm15, %ymm3
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_4x4_gen_lib4, .-inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10   <- alpha
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_A0_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_a0_4x4_lib4, @function
+inner_scale_a0_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_a0_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_4x4_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_a0_4x4_lib4, .-inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_4x4_lib4, @function
+inner_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_4x4_lib4:
+#endif
+#endif	
+	
+	vmovapd		0(%r10), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_4x4_lib4, .-inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_4x4_gen_lib4, @function
+inner_blend_scale_ab_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_gen_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+
+	vxorpd		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovapd		0(%r13), %ymm12
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vmovapd		32(%r13), %ymm12
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vmovapd		96(%r13), %ymm12
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+
+	jmp		3f
+
+0:
+
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$1, %r12d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vfmadd231pd	%ymm13, %ymm15, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vfmadd231pd	%ymm13, %ymm15, %ymm3
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r12d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vfmadd231pd	%ymm13, %ymm15, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vfmadd231pd	%ymm13, %ymm15, %ymm3
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd		32(%r13), %ymm13
+	vmovapd		32(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vfmadd231pd	%ymm13, %ymm15, %ymm1
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd		96(%r13), %ymm13
+	vmovapd		96(%r15), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vfmadd231pd	%ymm13, %ymm15, %ymm3
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_4x4_gen_lib4, .-inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_4x4_lib4, @function
+inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_4x4_lib4:
+#endif
+#endif	
+
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmovapd		0(%r10), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_4x4_lib4, .-inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization 
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
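+//
+// informal sketch of the factorization performed in registers (unblocked
+// lower Cholesky): for each column j < kn, if d_jj > 0.0 store
+// 1.0/sqrt(d_jj) into inv_diag_E[j], scale column j by it and subtract the
+// rank-1 update from the remaining columns; if d_jj <= 0.0 the factor 0.0
+// is stored instead and the column is zeroed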
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dpotrf_4x4_vs_lib4, @function
+inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dpotrf_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_vs_lib4:
+#endif
+#endif
+	
+	vxorpd	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd	.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd	LC04(%rip), %xmm14 // 1.0
+#endif
+
+	vmovsd			%xmm0, %xmm0, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe				1f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+2:
+	vmovsd			%xmm13, 0(%r10)
+//	vmovddup		%xmm13, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	cmpl			$2, %r11d
+	jl				0f // ret
+//	vperm2f128		$0x00, %ymm0, %ymm0, %ymm12
+//	vpermilpd		$0xf, %ymm12, %ymm13
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+
+	vpermilpd		$0x3, %xmm1, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe				3f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+4:
+	vmovsd			%xmm13, 8(%r10)
+//	vmovddup		%xmm13, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	cmpl			$3, %r11d
+	jl				0f // ret
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe				5f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+6:
+	vmovsd			%xmm13, 16(%r10)
+//	vmovddup		%xmm13, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	cmpl			$4, %r11d
+	jl				0f // ret
+//	vperm2f128		$0x11, %ymm2, %ymm2, %ymm12
+//	vpermilpd		$0xf, %ymm12, %ymm13
+	vpermpd			$0xff, %ymm2, %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+
+//	vextractf128	$0x1, %ymm3, %xmm13
+//	vpermilpd		$0x3, %xmm13, %xmm13
+	vpermpd			$0xff, %ymm3, %ymm13
+	vucomisd		%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe				7f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+8:
+	vmovsd			%xmm13, 24(%r10)
+//	vmovddup		%xmm13, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+
+	jmp				0f
+
+1:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				2b
+
+3:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				4b
+
+5:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				6b
+
+7:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				8b
+
+0:
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dpotrf_4x4_vs_lib4, .-inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
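+//
+// informal sketch (column-wise substitution): with L = lower(E) and
+// inv_diag_E[j] = 1.0/L[j][j], the routine computes D <- D * L^{-T}:
+//   d[:,0] *= inv_diag_E[0];  d[:,k] -= L[k][0]*d[:,0], k = 1..3;
+//   d[:,1] *= inv_diag_E[1];  ... and so on for columns 2 and 3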
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_4x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r11), %ymm13
+	cmpl			$2, %r12d
+	vmulpd			%ymm0, %ymm13, %ymm0
+
+	jl				0f // ret
+
+	vbroadcastsd	8(%r10), %ymm13
+	cmpl			$3, %r12d
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+
+	jl				0f // ret
+
+	vbroadcastsd	16(%r10), %ymm13
+	cmpl			$4, %r12d
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+
+	jl				0f // ret
+
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_one_4x4_lib4, @function
+inner_edge_dtrsm_rlt_one_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_one_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_4x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_one_4x4_lib4, .-inner_edge_dtrsm_rlt_one_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10  <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_one_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl			$2, %r11d
+
+	jl				0f // ret
+
+	vbroadcastsd	8(%r10), %ymm13
+	cmpl			$3, %r11d
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+
+	jl				0f // ret
+
+	vbroadcastsd	16(%r10), %ymm13
+	cmpl			$4, %r11d
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+
+	jl				0f // ret
+
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_one_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
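+//
+// informal sketch (backward substitution): with U = upper(E) and
+// inv_diag_E[j] = 1.0/U[j][j], the routine computes D <- D * U^{-T},
+// starting from the last column:
+//   d[:,3] *= inv_diag_E[3];  d[:,k] -= U[k][3]*d[:,3], k = 0..2;
+//   d[:,2] *= inv_diag_E[2];  ... and so on down to column 0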
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rut_inv_4x4_lib4, @function
+inner_edge_dtrsm_rut_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rut_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_4x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vbroadcastsd	112(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm2
+	vbroadcastsd	104(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm1
+	vbroadcastsd	96(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm0
+
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vbroadcastsd	72(%r10), %ymm12
+	vfnmadd231pd	%ymm2, %ymm12, %ymm1
+	vbroadcastsd	64(%r10), %ymm12
+	vfnmadd231pd	%ymm2, %ymm12, %ymm0
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vbroadcastsd	32(%r10), %ymm12
+	vfnmadd231pd	%ymm1, %ymm12, %ymm0
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rut_inv_4x4_lib4, .-inner_edge_dtrsm_rut_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rut_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rut_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl			$3, %r12d
+	jle				0f
+
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vbroadcastsd	112(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm2
+	vbroadcastsd	104(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm1
+	vbroadcastsd	96(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm0
+
+0:
+	cmpl			$2, %r12d
+	jle				1f
+
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vbroadcastsd	72(%r10), %ymm12
+	vfnmadd231pd	%ymm2, %ymm12, %ymm1
+	vbroadcastsd	64(%r10), %ymm12
+	vfnmadd231pd	%ymm2, %ymm12, %ymm0
+
+1:
+	cmpl			$1, %r12d
+	jle				2f
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vbroadcastsd	32(%r10), %ymm12
+	vfnmadd231pd	%ymm1, %ymm12, %ymm0
+
+2:
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rut_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_run_inv_4x4_lib4, @function
+inner_edge_dtrsm_run_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_run_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_4x4_lib4:
+#endif
+#endif
+
+	// first column
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+
+	// second column
+	vbroadcastsd	32(%r10), %ymm12
+	vfnmadd231pd	%ymm0, %ymm12, %ymm1
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+
+	// third column
+	vbroadcastsd	64(%r10), %ymm12
+	vfnmadd231pd	%ymm0, %ymm12, %ymm2
+	vbroadcastsd	72(%r10), %ymm12
+	vfnmadd231pd	%ymm1, %ymm12, %ymm2
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+
+	// fourth column
+	vbroadcastsd	96(%r10), %ymm12
+	vfnmadd231pd	%ymm0, %ymm12, %ymm3
+	vbroadcastsd	104(%r10), %ymm12
+	vfnmadd231pd	%ymm1, %ymm12, %ymm3
+	vbroadcastsd	112(%r10), %ymm12
+	vfnmadd231pd	%ymm2, %ymm12, %ymm3
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_run_inv_4x4_lib4, .-inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lln_one_4x4_lib4, @function
+inner_edge_dtrsm_lln_one_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lln_one_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_4x4_lib4:
+#endif
+#endif
+
+	vxorpd		%ymm14, %ymm14, %ymm14
+
+	vmovapd			0(%r10), %ymm12
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm12
+	vpermpd			$0x00, %ymm0, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm0
+	vpermpd			$0x00, %ymm1, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm1
+	vpermpd			$0x00, %ymm2, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm2
+	vpermpd			$0x00, %ymm3, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm3
+
+	vmovapd			32(%r10), %ymm12
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm12
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm0
+	vpermpd			$0x55, %ymm1, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm1
+	vpermpd			$0x55, %ymm2, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm2
+	vpermpd			$0x55, %ymm3, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm3
+
+	vmovapd			64(%r10), %ymm12
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm12
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm0
+	vpermpd			$0xaa, %ymm1, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm1
+	vpermpd			$0xaa, %ymm2, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm2
+	vpermpd			$0xaa, %ymm3, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lln_one_4x4_lib4, .-inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LUN_INV_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lun_inv_4x4_lib4, @function
+inner_edge_dtrsm_lun_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lun_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_4x4_lib4:
+#endif
+#endif
+	
+	vmovapd			96(%r10), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	24(%r11), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovapd			64(%r10), %xmm13
+	vbroadcastsd	16(%r11), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			32(%r10), %xmm13
+	vbroadcastsd	8(%r11), %ymm12
+
+	vpermilpd		$0xf, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vpermilpd		$0xf, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vpermilpd		$0xf, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vpermilpd		$0xf, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+
+	vbroadcastsd	0(%r11), %ymm12
+
+	vmulpd			%ymm0, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm0, %ymm0
+
+	vmulpd			%ymm1, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm1, %ymm1
+
+	vmulpd			%ymm2, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm2, %ymm2
+
+	vmulpd			%ymm3, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lun_inv_4x4_lib4, .-inner_edge_dtrsm_lun_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// r12  <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LUN_INV_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lun_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lun_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl	$3, %r12d
+	jle		0f
+
+	vmovapd			96(%r10), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	24(%r11), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0xf, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+0:
+	cmpl	$2, %r12d
+	jle		1f
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovapd			64(%r10), %xmm13
+	vbroadcastsd	16(%r11), %ymm12
+
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm14
+	vpermilpd		$0x0, %ymm14, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+1:
+	cmpl	$1, %r12d
+	jle		2f
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			32(%r10), %xmm13
+	vbroadcastsd	8(%r11), %ymm12
+
+	vpermilpd		$0xf, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vpermilpd		$0xf, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vpermilpd		$0xf, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vpermilpd		$0xf, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+2:
+
+	vbroadcastsd	0(%r11), %ymm12
+
+	vmulpd			%ymm0, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm0, %ymm0
+
+	vmulpd			%ymm1, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm1, %ymm1
+
+	vmulpd			%ymm2, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm2, %ymm2
+
+	vmulpd			%ymm3, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lun_inv_4x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+//
+// input arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
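+//
+// informal sketch: LU factorization without pivoting of the 4x4 block held
+// in ymm0..ymm3, overwriting it with the unit-lower multipliers of L below
+// the diagonal and with U on and above the diagonal; the reciprocals of the
+// U diagonal entries are stored in inv_diag_E[0..3]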
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGETRF_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgetrf_4x4_lib4, @function
+inner_edge_dgetrf_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgetrf_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_4x4_lib4:
+#endif
+#endif
+	
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd			LC04(%rip), %xmm14 // 1.0
+#endif
+//	vmovddup		%xmm14, %xmm14
+
+	// first column
+//	vblendpd		$0x1, %ymm0, %ymm12, %ymm12
+	vmovapd			%ymm0, %ymm12
+	vdivsd			%xmm0, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 0(%r10)
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vblendpd		$0x1, %ymm12, %ymm0, %ymm0
+
+	// second column
+	vpermpd			$0x00, %ymm1, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vblendpd		$0x2, %ymm1, %ymm13, %ymm12
+
+	vpermilpd		$0x3, %xmm1, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 8(%r10)
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vblendpd		$0x3, %ymm12, %ymm1, %ymm1
+
+	// third column
+	vpermpd			$0x00, %ymm2, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vblendpd		$0x2, %ymm2, %ymm13, %ymm12
+
+	vpermpd			$0x55, %ymm2, %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vblendpd		$0x4, %ymm2, %ymm12, %ymm12
+
+	vpermpd			$0xaa, %ymm2, %ymm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 16(%r10)
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vblendpd		$0x7, %ymm12, %ymm2, %ymm2
+
+	// fourth column
+	vpermpd			$0x00, %ymm3, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vblendpd		$0x2, %ymm3, %ymm13, %ymm12
+
+	vpermpd			$0x55, %ymm3, %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vblendpd		$0x4, %ymm3, %ymm12, %ymm12
+
+	vpermpd			$0xaa, %ymm3, %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vblendpd		$0x8, %ymm3, %ymm12, %ymm12
+	
+	vpermpd			$0xff, %ymm3, %ymm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vmovsd			%xmm13, 24(%r10)
+//	vmulpd			%ymm3, %ymm13, %ymm3
+	vblendpd		$0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgetrf_4x4_lib4, .-inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+	
+	vmovapd %ymm0,  0(%r10)
+	vmovapd %ymm1, 32(%r10)
+	vmovapd %ymm2, 64(%r10)
+	vmovapd %ymm3, 96(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
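+//
+// informal sketch of the masking: km is converted to double, broadcast and
+// subtracted from the constant vector .LC02 (assumed here to hold
+// { 0.5, 1.5, 2.5, 3.5 }); the sign bits of the result drive vmaskmovpd so
+// only the first km rows of each column are written, while columns beyond
+// kn are skipped by the compare-and-branch chain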
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_vs_lib4, @function
+inner_store_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	vmaskmovpd	%ymm0, %ymm15,  0(%r10)
+	cmpl		$2, %r12d
+	jl			0f // end
+	vmaskmovpd	%ymm1, %ymm15, 32(%r10)
+	cmpl		$3, %r12d
+	jl			0f // end
+	vmaskmovpd	%ymm2, %ymm15, 64(%r10)
+	je			0f // end
+	vmaskmovpd	%ymm3, %ymm15, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_vs_lib4, .-inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular
+//
+// input arguments:
+// r10   <- D
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_4x4_lib4, @function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_lib4:
+#endif
+#endif
+	
+	vmovapd		%ymm0, 0(%r10)
+	vmovapd		32(%r10), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm1, %ymm1	
+	vmovapd		%ymm1, 32(%r10)
+	vmovapd		64(%r10), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm2, %ymm2	
+	vmovapd		%ymm2, 64(%r10)
+	vmovapd		96(%r10), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm3, %ymm3	
+	vmovapd		%ymm3, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs lower triangular
+//
+// input arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_4x4_vs_lib4, @function
+inner_store_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	vmaskmovpd	%ymm0, %ymm15,  0(%r10)
+	cmpl		$2, %r12d
+	jl			0f // end
+	vmovapd		32(%r10), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm1, %ymm1	
+	vmaskmovpd	%ymm1, %ymm15, 32(%r10)
+	cmpl		$3, %r12d
+	jl			0f // end
+	vmovapd		64(%r10), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm2, %ymm2	
+	vmaskmovpd	%ymm2, %ymm15, 64(%r10)
+	je			0f // end
+	vmovapd		96(%r10), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm3, %ymm3	
+	vmaskmovpd	%ymm3, %ymm15, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_4x4_vs_lib4, .-inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
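+//
+// informal sketch: rows are masked to the [m0,m1) range, the start column
+// n0 shifts the accumulators and the D pointer, and a nonzero offset splits
+// each masked store across two row panels D0 and D1 = D0 + 4*sdd*sizeof(double)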
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_gen_lib4, @function
+inner_store_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_gen_lib4:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2sd	%r13d, %xmm14, %xmm14
+	vcvtsi2sd	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm12
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm12, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm12, %ymm15
+	vandpd		%ymm14, %ymm15, %ymm15
+
+	// shift the D pointer and solution registers to skip the first n0 columns
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm3, %ymm2
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm2, %ymm1
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	vmaskmovpd	%ymm0, %ymm15,  0(%r11)
+	cmpl		$2, %r15d
+	jl			4f // end
+	vmaskmovpd	%ymm1, %ymm15, 32(%r11)
+	cmpl		$3, %r15d
+	jl			4f // end
+	vmaskmovpd	%ymm2, %ymm15, 64(%r11)
+	je			4f // end
+	vmaskmovpd	%ymm3, %ymm15, 96(%r11)
+
+	jmp		4f
+
+0:
+	
+	cmpl	$1, %r10d
+	jg		1f
+
+	// offset==1
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm0, %ymm12, %ymm0
+
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm1, %ymm12, %ymm1
+
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm2, %ymm12, %ymm2
+
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm3, %ymm12, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm15, %ymm12, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC08(%rip), %ymm12
+	vmovupd		.LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC08(%rip), %ymm12
+	vmovupd		LC05(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r10d
+	jg		2f
+
+	// offset==2
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm0
+
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm1
+
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm2
+
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC09(%rip), %ymm12
+	vmovupd		.LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC09(%rip), %ymm12
+	vmovupd		LC06(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm0, %ymm0
+
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm1, %ymm1
+
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm2, %ymm2
+
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm3, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC10(%rip), %ymm12
+	vmovupd		.LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC10(%rip), %ymm12
+	vmovupd		LC07(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+3:
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			4f // end
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			4f // end
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 1)
+	je			4f // end
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 1)
+
+4:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_gen_lib4, .-inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_4X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_4x4_gen_lib4, @function
+inner_store_l_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_gen_lib4:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2sd	%r13d, %xmm14, %xmm14
+	vcvtsi2sd	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm12
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm12, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm12, %ymm15
+	vandpd		%ymm14, %ymm15, %ymm15
+
+	// shift the D pointer and solution registers to skip the first n0 columns
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm3, %ymm2
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm2, %ymm1
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm14
+#endif
+
+	vmaskmovpd	%ymm0, %ymm15,  0(%r11)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x1, %ymm14, %ymm15, %ymm15
+	vmaskmovpd	%ymm1, %ymm15, 32(%r11)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x2, %ymm14, %ymm15, %ymm15
+	vmaskmovpd	%ymm2, %ymm15, 64(%r11)
+	je			3f // end
+	vblendpd	$0x4, %ymm14, %ymm15, %ymm15
+	vmaskmovpd	%ymm3, %ymm15, 96(%r11)
+
+	jmp		3f
+
+0:
+	
+	cmpl	$1, %r10d
+	jg		1f
+
+	// offset==1
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm0, %ymm12, %ymm0
+
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm1, %ymm12, %ymm1
+
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm2, %ymm12, %ymm2
+
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm3, %ymm12, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm15, %ymm12, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC08(%rip), %ymm12
+	vmovupd		.LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC08(%rip), %ymm12
+	vmovupd		LC05(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm14
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x2, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x4, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 1)
+	je			3f // end
+	vblendpd	$0x8, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 1)
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r10d
+	jg		2f
+
+	// offset==2
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm0
+
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm1
+
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm2
+
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC09(%rip), %ymm12
+	vmovupd		.LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC09(%rip), %ymm12
+	vmovupd		LC06(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm14
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x4, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x8, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 1)
+	je			3f // end
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 1)
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vperm2f128	$0x01, %ymm0, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm0, %ymm0
+
+	vperm2f128	$0x01, %ymm1, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm1, %ymm1
+
+	vperm2f128	$0x01, %ymm2, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm2, %ymm2
+
+	vperm2f128	$0x01, %ymm3, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm3, %ymm3
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC10(%rip), %ymm12
+	vmovupd		.LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC10(%rip), %ymm12
+	vmovupd		LC07(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm15, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm14
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x8, %ymm14, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 1)
+	je			3f // end
+	vblendpd	$0x2, %ymm14, %ymm13, %ymm13
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 1)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_4x4_gen_lib4, .-inner_store_l_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+//                               rdi    rsi            rdx        rcx        r8            r9         rsp+8
+// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
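+//
+// usage sketch (assuming the usual lib4 panel-major layout for A, B, C, D):
+//   double alpha = 1.0, beta = 0.0;
+//   kernel_dgemm_nt_4x4_lib4(k, &alpha, A, B, &beta, C, D);
+// computes one 4x4 block D = alpha * A * B^T + beta * C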
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x4_lib4
+	.type kernel_dgemm_nt_4x4_lib4, @function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x4_lib4
+	.def kernel_dgemm_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx        r8            r9         rsp+8     rsp+16   rsp+24
+// void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x4_vs_lib4
+	.type kernel_dgemm_nt_4x4_vs_lib4, @function
+kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x4_vs_lib4
+_kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x4_vs_lib4
+	.def kernel_dgemm_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_vs_lib4, .-kernel_dgemm_nt_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                   rdi    rsi            rdx        rcx        r8            r9           rsp+8      rsp+16   rsp+24       rsp+32     rsp+40   rsp+48  rsp+56  rsp+64  rsp+72
+// void kernel_dgemm_nt_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
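+//
+// Generalized variant: C and D start at row offsets offsetC/offsetD inside their
+// 4-row panels (panel strides sdc/sdd, counted in columns), so the output block may
+// straddle two panels; m0/m1 and n0/n1 select the row and column range that is
+// actually stored.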
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x4_gen_lib4
+	.type kernel_dgemm_nt_4x4_gen_lib4, @function
+kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x4_gen_lib4
+_kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x4_gen_lib4
+	.def kernel_dgemm_nt_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // offsetC
+	movq	ARG7, %r13 // C
+	movq	ARG8, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG9, %r10 // offsetD
+	movq	ARG10, %r11 // D
+	movq	ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG12, %r13 // m0
+	movq	ARG13, %r14 // m1
+	movq	ARG14, %r15 // n0
+	movq	ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_gen_lib4, .-kernel_dgemm_nt_4x4_gen_lib4
+#endif
+
+
+
+
+
+//                               rdi    rsi            rdx        rcx          r8         r9       rsp+8         rsp+16     rsp+24
+// void kernel_dgemm_nn_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
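+//
+// 'nn' variant: D = alpha*A*B + beta*C, with B a k x 4 block in panel-major storage
+// spanning several 4-row panels (panel stride sdb); the initial row offset offsetB
+// is consumed by the edge routine before the main accumulation loop.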
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_4x4_lib4
+	.type kernel_dgemm_nn_4x4_lib4, @function
+kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_4x4_lib4
+_kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_4x4_lib4
+	.def kernel_dgemm_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_4x4_lib4, .-kernel_dgemm_nn_4x4_lib4
+#endif
+
+
+
+
+
+//                                   rdi    rsi            rdx        rcx       r8         r9       rsp+8         rsp+16    rsp+24     rsp+32    rsp+40   rsp+48     rsp+56   rsp+64  rsp+72  rsp+80  rsp+88
+// void kernel_dgemm_nn_4x4_gen_lib4(int k, double *alpha, double *A, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_4x4_gen_lib4
+	.type kernel_dgemm_nn_4x4_gen_lib4, @function
+kernel_dgemm_nn_4x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_4x4_gen_lib4
+_kernel_dgemm_nn_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_4x4_gen_lib4
+	.def kernel_dgemm_nn_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // offsetC
+	movq	ARG9, %r13 // C
+	movq	ARG10, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG11, %r10 // offsetD
+	movq	ARG12, %r11 // D
+	movq	ARG13, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG14, %r13 // m0
+	movq	ARG15, %r14 // m1
+	movq	ARG16, %r15 // n0
+	movq	ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_4x4_gen_lib4, .-kernel_dgemm_nn_4x4_gen_lib4
+#endif
+
+
+
+
+
+//                               rdi    rsi            rdx        rcx        r8            r9         rsp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
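+//
+// dsyrk: same accumulation as kernel_dgemm_nt_4x4_lib4, but only the lower
+// triangle of the 4x4 result block is stored (inner_store_l_4x4_lib4).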
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_4x4_lib4
+	.type kernel_dsyrk_nt_l_4x4_lib4, @function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_4x4_lib4
+	.def kernel_dsyrk_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call	inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq	_inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx        r8            r9         rsp+8     rsp+16   rsp+24
+// void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_4x4_vs_lib4
+	.type kernel_dsyrk_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_4x4_vs_lib4
+_kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_4x4_vs_lib4
+	.def kernel_dsyrk_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call	inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq	_inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_4x4_vs_lib4, .-kernel_dsyrk_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                     rdi    rsi            rdx        rcx        r8            r9           rsp+8      rsp+16   rsp+24       rsp+32     rsp+40   rsp+48  rsp+56  rsp+64  rsp+72
+// void kernel_dsyrk_nt_l_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_4x4_gen_lib4
+	.type kernel_dsyrk_nt_l_4x4_gen_lib4, @function
+kernel_dsyrk_nt_l_4x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_4x4_gen_lib4
+_kernel_dsyrk_nt_l_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_4x4_gen_lib4
+	.def kernel_dsyrk_nt_l_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // offsetC
+	movq	ARG7, %r13 // C
+	movq	ARG8, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG9, %r10 // offsetD
+	movq	ARG10, %r11 // D
+	movq	ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG12, %r13 // m0
+	movq	ARG13, %r14 // m1
+	movq	ARG14, %r15 // n0
+	movq	ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_4x4_gen_lib4, .-kernel_dsyrk_nt_l_4x4_gen_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx          r8         r9       rsp+8
+// void kernel_dtrmm_nn_rl_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *D);
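+//
+// Triangular matrix-matrix multiply (right, lower): D = alpha*A*B with the right
+// factor lower triangular. The edge routine handles the triangular leading block
+// of B (taking offsetB into account), the regular nn loop accumulates the rest,
+// and the result is scaled by alpha only (no beta/C term).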
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nn_rl_4x4_lib4
+	.type kernel_dtrmm_nn_rl_4x4_lib4, @function
+kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nn_rl_4x4_lib4
+_kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nn_rl_4x4_lib4
+	.def kernel_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG5, %r12 // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nn_rl_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nn_rl_4x4_lib4, .-kernel_dtrmm_nn_rl_4x4_lib4
+#endif
+
+
+
+
+
+//                                      rdi    rsi            rdx        rcx          r8         r9       rsp+8        rsp+16     rsp+24   rsp+32  rsp+40  rsp+48  rsp+56
+// void kernel_dtrmm_nn_rl_4x4_gen_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nn_rl_4x4_gen_lib4
+	.type kernel_dtrmm_nn_rl_4x4_gen_lib4, @function
+kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nn_rl_4x4_gen_lib4
+_kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nn_rl_4x4_gen_lib4
+	.def kernel_dtrmm_nn_rl_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG5, %r12 // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NN_RL_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // offsetD
+	movq	ARG8, %r11 // D
+	movq	ARG9, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG10, %r13 // m0
+	movq	ARG11, %r14 // m1
+	movq	ARG12, %r15 // n0
+	movq	ARG13, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nn_rl_4x4_gen_lib4, .-kernel_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx        r8            r9         rsp+8
+// void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
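+//
+// Triangular matrix-matrix multiply (right, upper, B transposed):
+// D = alpha*A*B^T + beta*C. The main nt loop covers the last k-4 columns
+// (A+4*bs, B+4*bs below), while the 4x4 triangular corner at the start of the
+// panels is handled by the dedicated edge routine.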
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_4x4_lib4
+	.type kernel_dtrmm_nt_ru_4x4_lib4, @function
+kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_4x4_lib4
+_kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_4x4_lib4
+	.def kernel_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // B
+	addq	$128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend
+
+#if MACRO_LEVEL>=1
+//	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+//	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+//	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG3, %r10
+	movq	ARG4, %r11
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_4x4_lib4, .-kernel_dtrmm_nt_ru_4x4_lib4
+#endif
+
+
+
+
+
+//                                     rdi    rsi            rdx        rcx        r8            r9         rsp+8     rsp+16   rsp+24
+// void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+	.type kernel_dtrmm_nt_ru_4x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_4x4_vs_lib4
+_kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+	.def kernel_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // B
+	addq	$128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+#if MACRO_LEVEL>=1
+//	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+//	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+//	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_4x4_vs_lib4, .-kernel_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                  edi    rsi        rdx        rcx        r8         r9
+// void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
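+//
+// Cholesky factorization of a 4x4 diagonal block: the nt 'sub' kernel accumulates
+// -A*B^T, inner_scale_11 adds C, the edge routine factorizes the resulting block
+// (lower factor) and writes the inverted diagonal entries to inv_diag_D, and only
+// the lower triangle of D is stored.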
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_4x4_lib4
+	.type kernel_dpotrf_nt_l_4x4_lib4, @function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_4x4_lib4
+	.def kernel_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movl	$4, %r11d
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+//                                     edi    rsi        rdx        rcx        r8         r9                  rsp+8   rsp+16
+// void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_4x4_vs_lib4
+	.type kernel_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_4x4_vs_lib4
+	.def kernel_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movq	ARG8, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG7, %r11 // km 
+	movq	ARG8, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                        1       2           3           4       5           6           7          8          9
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
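+//
+// Fused dsyrk + dpotrf: accumulates +Ap*Bp^T over kp columns and -Am*Bm^T over km
+// columns on top of C, then factorizes the 4x4 block exactly as in
+// kernel_dpotrf_nt_l_4x4_lib4 (lower factor, inverted diagonal in inv_diag_D).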
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$4, %r11d
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+//                                           edi     rsi         rdx         ecx     r8          r9          rsp+8      rsp+16     rsp+24             rsp+32   rsp+40
+// void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+	movq	ARG10, %r11 // km 
+	movq	ARG11, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx        r8         r9         rsp+8     
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
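+//
+// Triangular solve used after dpotrf: the 'sub' kernel forms C - A*B^T, then the
+// edge routine solves against the transposed lower-triangular factor E on the
+// right (inner_edge_dtrsm_rlt_inv_4x4_lib4), using the pre-inverted diagonal
+// inv_diag_E so no divisions are needed in the kernel.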
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+	.type kernel_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+	.def kernel_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                            edi     rsi         rdx         ecx     r8          r9          rsp+8      rsp+16     rsp+24     rsp+32
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
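+//
+// Fused dgemm + dtrsm: accumulates +Ap*Bp^T (kp columns) and -Am*Bm^T (km columns)
+// on top of C, then applies the same right-side lower-triangular solve as
+// kernel_dtrsm_nt_rl_inv_4x4_lib4 before storing D.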
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10   // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx        r8         r9         rsp+8               rsp+16  rsp+24  
+// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.type kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.def kernel_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn // TODO scale gen
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                               edi     rsi         rdx         ecx     r8          r9          rsp+8    rsp+16     rsp+24     rsp+32                rsp+40 rsp+48
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10  // C 
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D 
+	movq	ARG11, %r11 // km 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx        r8         r9
+// void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E);
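+//
+// Like kernel_dtrsm_nt_rl_inv_4x4_lib4, but the triangular factor E has a unit
+// diagonal, so no inv_diag_E argument is needed (inner_edge_dtrsm_rlt_one_4x4_lib4).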
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_one_4x4_lib4
+	.type kernel_dtrsm_nt_rl_one_4x4_lib4, @function
+kernel_dtrsm_nt_rl_one_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_one_4x4_lib4
+_kernel_dtrsm_nt_rl_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_one_4x4_lib4
+	.def kernel_dtrsm_nt_rl_one_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_one_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_one_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_one_4x4_lib4, .-kernel_dtrsm_nt_rl_one_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx        r8         r9         rsp+8   rsp+16
+// void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+	.type kernel_dtrsm_nt_rl_one_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+	.def kernel_dtrsm_nt_rl_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG8, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG7, %r11 // km 
+	movq	ARG8, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_one_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx        r8         r9         rsp+8
+// void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
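+//
+// Right-side solve against a transposed upper-triangular factor E
+// (inner_edge_dtrsm_rut_inv_4x4_lib4), applied to C - A*B^T, using the
+// pre-inverted diagonal inv_diag_E.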
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_ru_inv_4x4_lib4
+	.type kernel_dtrsm_nt_ru_inv_4x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_ru_inv_4x4_lib4
+_kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_ru_inv_4x4_lib4
+	.def kernel_dtrsm_nt_ru_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11 // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rut_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rut_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_ru_inv_4x4_lib4, .-kernel_dtrsm_nt_ru_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx        r8         r9         rsp+8                rsp+16  rsp+24
+// void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double  *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+	.type kernel_dtrsm_nt_ru_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+	.def kernel_dtrsm_nt_ru_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11 // inv_diag_E 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_ru_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx      r8         r9         rsp+8      rsp+16
+// void kernel_dtrsm_nn_ru_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
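+//
+// 'nn' solve variant: forms C - A*B (B panel-strided by sdb), then solves against
+// the upper-triangular factor E on the right (inner_edge_dtrsm_run_inv_4x4_lib4)
+// using the pre-inverted diagonal inv_diag_E.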
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ru_inv_4x4_lib4
+	.type kernel_dtrsm_nn_ru_inv_4x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ru_inv_4x4_lib4
+_kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ru_inv_4x4_lib4
+	.def kernel_dtrsm_nn_ru_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+	movq	ARG8, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_run_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ru_inv_4x4_lib4, .-kernel_dtrsm_nn_ru_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx      r8         r9         rsp+8      rsp+16              rsp+24  rsp+32
+// void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+	.type kernel_dtrsm_nn_ru_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+	.def kernel_dtrsm_nn_ru_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+	movq	ARG8, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_run_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+	movq	ARG9, %r11  // km 
+	movq	ARG10, %r12  // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ru_inv_4x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx      r8         r9         rsp+8
+// void kernel_dtrsm_nn_ll_one_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E);
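+//
+// Left-side solve with a unit lower-triangular factor E
+// (inner_edge_dtrsm_lln_one_4x4_lib4), applied to C - A*B; no diagonal
+// inversion is required.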
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ll_one_4x4_lib4
+	.type kernel_dtrsm_nn_ll_one_4x4_lib4, @function
+kernel_dtrsm_nn_ll_one_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ll_one_4x4_lib4
+_kernel_dtrsm_nn_ll_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ll_one_4x4_lib4
+	.def kernel_dtrsm_nn_ll_one_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lln_one_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ll_one_4x4_lib4, .-kernel_dtrsm_nn_ll_one_4x4_lib4
+#endif
+
+
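+// The dtrsm_nn_ll_one kernels solve from the left with a unit-diagonal lower
+// triangular E (forward substitution), which is why no inv_diag argument is needed;
+// roughly: first D = C - A*B as above, then solve E * X = D column by column.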
+
+
+
+//                                         edi    rsi        rdx        ecx      r8         r9         rsp+8      rsp+16  rsp+24
+// void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+	.type kernel_dtrsm_nn_ll_one_4x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+	.def kernel_dtrsm_nn_ll_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lln_one_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+	movq	ARG8, %r11  // km 
+	movq	ARG9, %r12  // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ll_one_4x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx      r8         r9         rsp+8      rsp+16
+// void kernel_dtrsm_nn_lu_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_lu_inv_4x4_lib4
+	.type kernel_dtrsm_nn_lu_inv_4x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_lu_inv_4x4_lib4
+_kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_lu_inv_4x4_lib4
+	.def kernel_dtrsm_nn_lu_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+	movq	ARG8, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lun_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lun_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_lu_inv_4x4_lib4, .-kernel_dtrsm_nn_lu_inv_4x4_lib4
+#endif
+
+
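+// The dtrsm_nn_lu_inv kernels solve from the left with an upper triangular E
+// (backward substitution), scaling each row by the precomputed reciprocal in
+// inv_diag_E; as above, the 4x4 block is first updated as D = C - A*B.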
+
+
+
+//                                         edi    rsi        rdx        ecx      r8         r9         rsp+8      rsp+16              rsp+24  rsp+32
+// void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+	.type kernel_dtrsm_nn_lu_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+	.def kernel_dtrsm_nn_lu_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+	movq	ARG8, %r11  // inv_diag_E 
+	movq	ARG9, %r12  // km 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LUN_INV_4X4_VS_LIB4 // TODO
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lun_inv_4x4_vs_lib4 // TODO
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lun_inv_4x4_vs_lib4 // TODO
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+	movq	ARG9, %r11  // km 
+	movq	ARG10, %r12  // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_lu_inv_4x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                edi    rsi        rdx        rcx      r8         r9         rsp+8
+// void kernel_dgetrf_nn_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D);
+
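+// The edge routine below appears to perform an unpivoted LU factorization of the
+// updated 4x4 block (D = C - A*B), with unit-diagonal L stored below the diagonal of D,
+// U stored on and above it, and the reciprocals of the U diagonal written to inv_diag_D.
+// A rough reference sketch:
+//   for(kk=0; kk<4; kk++) {
+//     inv_diag_D[kk] = 1.0 / D[kk][kk];
+//     for(ii=kk+1; ii<4; ii++)
+//       D[ii][kk] *= inv_diag_D[kk];                     // column kk of L
+//     for(jj=kk+1; jj<4; jj++)
+//       for(ii=kk+1; ii<4; ii++)
+//         D[ii][jj] -= D[ii][kk] * D[kk][jj];            // trailing submatrix update
+//   }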
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_4x4_lib4
+	.type kernel_dgetrf_nn_4x4_lib4, @function
+kernel_dgetrf_nn_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_4x4_lib4
+_kernel_dgetrf_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_4x4_lib4
+	.def kernel_dgetrf_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG7, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_4x4_lib4, .-kernel_dgetrf_nn_4x4_lib4
+#endif
+
+
+
+
+
+//                                   edi    rsi        rdx        rcx      r8         r9         rsp+8               rsp+16  rsp+24
+// void kernel_dgetrf_nn_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_4x4_vs_lib4
+	.type kernel_dgetrf_nn_4x4_vs_lib4, @function
+kernel_dgetrf_nn_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_4x4_vs_lib4
+_kernel_dgetrf_nn_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_4x4_vs_lib4
+	.def kernel_dgetrf_nn_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12  // B
+	movq	ARG4, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG7, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG6, %r10 // D
+
+	movq	ARG8, %r11  // km 
+	movq	ARG9, %r12  // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_4x4_vs_lib4, .-kernel_dgetrf_nn_4x4_vs_lib4
+#endif
+
+
+
+
+
+#if 0
+//                                   rdi    rsi            rdx        rcx        r8            r9         rsp+8
+// void kernel_dlauum_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dlauum_nt_4x4_lib4
+	.type kernel_dlauum_nt_4x4_lib4, @function
+kernel_dlauum_nt_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dlauum_nt_4x4_lib4
+_kernel_dlauum_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dlauum_nt_4x4_lib4
+	.def kernel_dlauum_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dlauum_nt_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // B
+	addq	$128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+#if MACRO_LEVEL>=1
+//	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+//	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+//	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DLAUUM_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dlauum_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dlauum_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner loader nn
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlauum_nt_4x4_lib4, .-kernel_dlauum_nt_4x4_lib4
+#endif
+
+
+
+
+
+//                                   rdi    rsi            rdx        rcx        r8            r9         rsp+8      rsp+16  rsp+24
+// void kernel_dlauum_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dlauum_nt_4x4_vs_lib4
+	.type kernel_dlauum_nt_4x4_vs_lib4, @function
+kernel_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dlauum_nt_4x4_vs_lib4
+_kernel_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dlauum_nt_4x4_vs_lib4
+	.def kernel_dlauum_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dlauum_nt_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // B
+	addq	$128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+#if MACRO_LEVEL>=1
+//	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+//	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+//	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DLAUUM_NT_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dlauum_nt_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+	// call inner loader nn
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlauum_nt_4x4_vs_lib4, .-kernel_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+//                             1         2           3           4
+// void kernel_dlarfb4_r_4_lib4(int kmax, double *pV, double *pT, double *pD);
+
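+// This routine appears to apply a block reflector built from 4 elementary reflectors
+// to a 4-row panel from the right, in the spirit of LAPACK dlarfb: with V a kmax x 4
+// unit-lower matrix whose leading 4x4 block is handled explicitly, and T a 4x4
+// triangular factor, it forms W = D*V, then W = W*T, then updates D += W*V^T
+// (any sign convention being folded into T).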
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dlarfb4_r_4_lib4
+	.type kernel_dlarfb4_r_4_lib4, @function
+kernel_dlarfb4_r_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dlarfb4_r_4_lib4
+_kernel_dlarfb4_r_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dlarfb4_r_4_lib4
+	.def kernel_dlarfb4_r_4_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+//	vxorpd	%ymm0, %ymm0, %ymm0
+//	vmovapd	%ymm0, %ymm1
+//	vmovapd	%ymm0, %ymm2
+//	vmovapd	%ymm0, %ymm3
+	
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // D
+	movq	ARG2, %r12 // V
+
+	//
+	vmovapd			0(%r11), %ymm0
+	//
+	vmovapd			32(%r11), %ymm1
+	vbroadcastsd	32(%r12), %ymm13
+	vfmadd231pd		%ymm13, %ymm1, %ymm0
+	//
+	vmovapd			64(%r11), %ymm2
+	vbroadcastsd	64(%r12), %ymm13
+	vfmadd231pd		%ymm13, %ymm2, %ymm0
+	vbroadcastsd	72(%r12), %ymm13
+	vfmadd231pd		%ymm13, %ymm2, %ymm1
+	//
+	vmovapd			96(%r11), %ymm3
+	vbroadcastsd	96(%r12), %ymm13
+	vfmadd231pd		%ymm13, %ymm3, %ymm0
+	vbroadcastsd	104(%r12), %ymm13
+	vfmadd231pd		%ymm13, %ymm3, %ymm1
+	vbroadcastsd	112(%r12), %ymm13
+	vfmadd231pd		%ymm13, %ymm3, %ymm2
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+	movq	ARG3, %r10 // T
+
+	//
+	vbroadcastsd	120(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	//
+	vbroadcastsd	112(%r10), %ymm12
+	vfmadd231pd		%ymm2, %ymm12, %ymm3
+	vbroadcastsd	80(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	//
+	vbroadcastsd	104(%r10), %ymm12
+	vfmadd231pd		%ymm1, %ymm12, %ymm3
+	vbroadcastsd	72(%r10), %ymm12
+	vfmadd231pd		%ymm1, %ymm12, %ymm2
+	vbroadcastsd	40(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	//
+	vbroadcastsd	96(%r10), %ymm12
+	vfmadd231pd		%ymm0, %ymm12, %ymm3
+	vbroadcastsd	64(%r10), %ymm12
+	vfmadd231pd		%ymm0, %ymm12, %ymm2
+	vbroadcastsd	32(%r10), %ymm12
+	vfmadd231pd		%ymm0, %ymm12, %ymm1
+	vbroadcastsd	0(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // V
+	movq	ARG4, %r12 // D
+
+	//
+	vmovapd			0(%r12), %ymm12
+	vaddpd			%ymm12, %ymm0, %ymm12
+	vmovapd			%ymm12, 0(%r12)
+	//
+	vmovapd			32(%r12), %ymm12
+	vbroadcastsd	32(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vaddpd			%ymm12, %ymm1, %ymm12
+	vmovapd			%ymm12, 32(%r12)
+	//
+	vmovapd			64(%r12), %ymm12
+	vbroadcastsd	64(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vbroadcastsd	72(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vaddpd			%ymm12, %ymm2, %ymm12
+	vmovapd			%ymm12, 64(%r12)
+	//
+	vmovapd			96(%r12), %ymm12
+	vbroadcastsd	96(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vbroadcastsd	104(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vbroadcastsd	112(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vaddpd			%ymm12, %ymm3, %ymm12
+	vmovapd			%ymm12, 96(%r12)
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEBP_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgebp_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgebp_add_nn_4x4_lib4
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlarfb4_r_4_lib4, .-kernel_dlarfb4_r_4_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { -1 -1 -1 1 }
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { -1 -1 -1 -1 }
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 3.5 2.5 1.5 0.5 }
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { 7.5 6.5 5.5 4.5 }
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC04: // { 1.0 1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgemm_8x4_lib4.S b/kernel/avx2/kernel_dgemm_8x4_lib4.S
new file mode 100644
index 0000000..82a5a86
--- /dev/null
+++ b/kernel/avx2/kernel_dgemm_8x4_lib4.S
@@ -0,0 +1,12995 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
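+// Note on the prologue/epilogue pair: the System V AMD64 ABI (Linux/Mac) only requires
+// rbx, rbp and r12-r15 to be preserved, while the Windows x64 ABI additionally treats
+// rdi, rsi and xmm6-xmm15 as callee-saved and passes the first four arguments in
+// rcx/rdx/r8/r9 with a 32-byte shadow space on the stack, hence the larger STACKSIZE,
+// the extra saves/restores, and ARG5 starting at STACKSIZE+40 in the OS_WINDOWS branch.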
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+4*k*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
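+// Reference operation for this nt kernel (a sketch; A0 is the first 4-row panel of A,
+// A1 the second at A + 4*sda doubles, both panel-major with bs=4):
+//   for(kk=0; kk<k; kk++)
+//     for(jj=0; jj<4; jj++) {
+//       for(ii=0; ii<4; ii++) D0[ii][jj] += A0[ii][kk] * B[jj][kk];   // rows 0..3
+//       for(ii=0; ii<4; ii++) D1[ii][jj] += A1[ii][kk] * B[jj][kk];   // rows 4..7
+//     }
+// The exact lane ordering inside each accumulator depends on which of the two schemes
+// below (broadcast or shuffle) is compiled in.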
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nt_8x4_lib4, @function
+inner_kernel_dgemm_add_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nt_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_8x4_lib4:
+#endif
+#endif
+	
+// broadcast scheme
+#if 1
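+// Two variants are kept in this file: the broadcast scheme below (selected by "#if 1")
+// broadcasts one element of B at a time and accumulates plain columns of D, while the
+// shuffle scheme in the #else branch loads the four B values of each k-iteration at
+// once and permutes them with vshufpd/vperm2f128, which is what produces the shuffled
+// accumulator layout listed in the register comments above.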
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vmovapd			32(%r11), %ymm10 // A0
+
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm0
+	vfmadd231pd		%ymm11, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm8 // A0
+
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm1
+	vfmadd231pd		%ymm11, %ymm12, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	48(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm2
+	vfmadd231pd		%ymm11, %ymm12, %ymm6
+
+	vbroadcastsd	56(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm3
+	vfmadd231pd		%ymm11, %ymm12, %ymm7
+
+	// unroll 2
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vmovapd			96(%r11), %ymm10 // A0
+
+	vbroadcastsd	72(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	80(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	88(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 3
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm0
+	vfmadd231pd		%ymm11, %ymm12, %ymm4
+	vmovapd			0(%r11), %ymm8 // A0
+
+	vbroadcastsd	104(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm1
+	vfmadd231pd		%ymm11, %ymm12, %ymm5
+	vmovapd			0(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	112(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm2
+	vfmadd231pd		%ymm11, %ymm12, %ymm6
+
+	vbroadcastsd	120(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm3
+	addq	$128, %r13
+	vfmadd231pd		%ymm11, %ymm12, %ymm7
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vmovapd			32(%r11), %ymm10 // A0
+
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm0
+	vfmadd231pd		%ymm11, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm8 // A0
+
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm1
+	vfmadd231pd		%ymm11, %ymm12, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	48(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm2
+	vfmadd231pd		%ymm11, %ymm12, %ymm6
+
+	vbroadcastsd	56(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm3
+	vfmadd231pd		%ymm11, %ymm12, %ymm7
+
+	// unroll 2
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vmovapd			96(%r11), %ymm10 // A0
+
+	vbroadcastsd	72(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	80(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	88(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 3
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm0
+	vfmadd231pd		%ymm11, %ymm12, %ymm4
+//	vmovapd			0(%r11), %ymm8 // A0
+
+	vbroadcastsd	104(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm1
+	vfmadd231pd		%ymm11, %ymm12, %ymm5
+//	vmovapd			0(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	112(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm2
+	vfmadd231pd		%ymm11, %ymm12, %ymm6
+
+	vbroadcastsd	120(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm3
+	addq	$128, %r13
+	vfmadd231pd		%ymm11, %ymm12, %ymm7
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd			0(%r11), %ymm8 // A0[0]
+	vmovapd 		0(%r11, %r12, 1), %ymm9 // A1[0]
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	addq	$32, %r11
+
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	subl	$1, %r10d
+
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+	addq	$32, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+// shuffle scheme
+#else
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vmovapd 0(%r13), %ymm12 // B[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	// unroll 0
+	vmovapd 32(%r13), %ymm13 // B[4]
+	vfmadd231pd	%ymm8, %ymm12, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vfmadd231pd	%ymm9, %ymm12, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vfmadd231pd	%ymm8, %ymm14, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+	vfmadd231pd	%ymm9, %ymm14, %ymm5
+
+	vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+	vfmadd231pd	%ymm8, %ymm12, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vfmadd231pd	%ymm9, %ymm12, %ymm7
+
+	subl	$4, %r10d
+	vfmadd231pd	%ymm8, %ymm14, %ymm2
+	vfmadd231pd	%ymm9, %ymm14, %ymm6
+
+	// unroll 1
+	vmovapd 64(%r13), %ymm12 // B[8]
+	vfmadd231pd	%ymm10, %ymm13, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vfmadd231pd	%ymm11, %ymm13, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vfmadd231pd	%ymm10, %ymm14, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+	vfmadd231pd	%ymm11, %ymm14, %ymm5
+
+	vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+	vfmadd231pd	%ymm10, %ymm13, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vfmadd231pd	%ymm11, %ymm13, %ymm7
+
+	vfmadd231pd	%ymm10, %ymm14, %ymm2
+	vfmadd231pd	%ymm11, %ymm14, %ymm6
+
+	// unroll 2
+	vmovapd 96(%r13), %ymm13 // B[12]
+	vfmadd231pd	%ymm8, %ymm12, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vfmadd231pd	%ymm9, %ymm12, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vfmadd231pd	%ymm8, %ymm14, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+	vfmadd231pd	%ymm9, %ymm14, %ymm5
+
+	vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+	vfmadd231pd	%ymm8, %ymm12, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r13
+	vfmadd231pd	%ymm9, %ymm12, %ymm7
+	addq	$128, %r11
+
+	vfmadd231pd	%ymm8, %ymm14, %ymm2
+	vfmadd231pd	%ymm9, %ymm14, %ymm6
+
+
+	// unroll 3
+	vmovapd 0(%r13), %ymm12 // B[0]
+	vfmadd231pd	%ymm10, %ymm13, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vfmadd231pd	%ymm11, %ymm13, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vfmadd231pd	%ymm10, %ymm14, %ymm1
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vfmadd231pd	%ymm11, %ymm14, %ymm5
+
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vfmadd231pd	%ymm10, %ymm13, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vfmadd231pd	%ymm11, %ymm13, %ymm7
+
+	vfmadd231pd	%ymm10, %ymm14, %ymm2
+	vfmadd231pd	%ymm11, %ymm14, %ymm6
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vmovapd 32(%r13), %ymm13 // B[4]
+	vfmadd231pd	%ymm8, %ymm12, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vfmadd231pd	%ymm9, %ymm12, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vfmadd231pd	%ymm8, %ymm14, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+	vfmadd231pd	%ymm9, %ymm14, %ymm5
+
+	vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+	vfmadd231pd	%ymm8, %ymm12, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vfmadd231pd	%ymm9, %ymm12, %ymm7
+
+	subl	$4, %r10d
+	vfmadd231pd	%ymm8, %ymm14, %ymm2
+	vfmadd231pd	%ymm9, %ymm14, %ymm6
+
+	// unroll 1
+	vmovapd 64(%r13), %ymm12 // B[8]
+	vfmadd231pd	%ymm10, %ymm13, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vfmadd231pd	%ymm11, %ymm13, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vfmadd231pd	%ymm10, %ymm14, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+	vfmadd231pd	%ymm11, %ymm14, %ymm5
+
+	vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+	vfmadd231pd	%ymm10, %ymm13, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vfmadd231pd	%ymm11, %ymm13, %ymm7
+
+	vfmadd231pd	%ymm10, %ymm14, %ymm2
+	vfmadd231pd	%ymm11, %ymm14, %ymm6
+
+	// unroll 2
+	vmovapd 96(%r13), %ymm13 // B[12]
+	vfmadd231pd	%ymm8, %ymm12, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vfmadd231pd	%ymm9, %ymm12, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vfmadd231pd	%ymm8, %ymm14, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+	vfmadd231pd	%ymm9, %ymm14, %ymm5
+
+	vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+	vfmadd231pd	%ymm8, %ymm12, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r13
+	vfmadd231pd	%ymm9, %ymm12, %ymm7
+	addq	$128, %r11
+
+	vfmadd231pd	%ymm8, %ymm14, %ymm2
+	vfmadd231pd	%ymm9, %ymm14, %ymm6
+
+
+	// unroll 3
+//	vmovapd 0(%r13), %ymm12 // B[0]
+	vfmadd231pd	%ymm10, %ymm13, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vfmadd231pd	%ymm11, %ymm13, %ymm4
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vfmadd231pd	%ymm10, %ymm14, %ymm1
+//	vmovapd 0(%r11), %ymm8 // A0[0]
+	vfmadd231pd	%ymm11, %ymm14, %ymm5
+//	cmpl	$3, %r10d
+
+//	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vfmadd231pd	%ymm10, %ymm13, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+	vfmadd231pd	%ymm11, %ymm13, %ymm7
+
+	vfmadd231pd	%ymm10, %ymm14, %ymm2
+	vfmadd231pd	%ymm11, %ymm14, %ymm6
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd 0(%r13), %ymm12 // B[0]
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+	vfmadd231pd	%ymm8, %ymm12, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vfmadd231pd	%ymm9, %ymm12, %ymm4
+	addq	$32, %r11
+
+	vfmadd231pd	%ymm8, %ymm14, %ymm1
+	addq	$32, %r13
+	vfmadd231pd	%ymm9, %ymm14, %ymm5
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+	vfmadd231pd	%ymm8, %ymm14, %ymm3
+	vfmadd231pd	%ymm9, %ymm14, %ymm7
+
+	vshufpd $0x5, %ymm14, %ymm14, %ymm14
+	vfmadd231pd	%ymm8, %ymm14, %ymm2
+	subl	$1, %r10d
+	vfmadd231pd	%ymm9, %ymm14, %ymm6
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nt_8x4_lib4, .-inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+4*k*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
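+// Same structure as inner_kernel_dgemm_add_nt_8x4_lib4 above, but accumulating the
+// product with the opposite sign (vfnmadd231pd), i.e. roughly D -= A * B^T on the
+// 8x4 block; only the broadcast scheme is provided here.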
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nt_8x4_lib4, @function
+inner_kernel_dgemm_sub_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nt_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_8x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm0
+	vfnmadd231pd	%ymm9, %ymm12, %ymm4
+	vmovapd			32(%r11), %ymm10 // A0
+
+	vbroadcastsd	8(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm1
+	vfnmadd231pd	%ymm9, %ymm12, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	16(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm2
+	vfnmadd231pd	%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	24(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm3
+	vfnmadd231pd	%ymm9, %ymm12, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	32(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm0
+	vfnmadd231pd	%ymm11, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm8 // A0
+
+	vbroadcastsd	40(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm1
+	vfnmadd231pd	%ymm11, %ymm12, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	48(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm2
+	vfnmadd231pd	%ymm11, %ymm12, %ymm6
+
+	vbroadcastsd	56(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm3
+	vfnmadd231pd	%ymm11, %ymm12, %ymm7
+
+	// unroll 2
+	vbroadcastsd	64(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm0
+	vfnmadd231pd	%ymm9, %ymm12, %ymm4
+	vmovapd			96(%r11), %ymm10 // A0
+
+	vbroadcastsd	72(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm1
+	vfnmadd231pd	%ymm9, %ymm12, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	80(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm2
+	vfnmadd231pd	%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	88(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm3
+	vfnmadd231pd	%ymm9, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 3
+	vbroadcastsd	96(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm0
+	vfnmadd231pd	%ymm11, %ymm12, %ymm4
+	vmovapd			0(%r11), %ymm8 // A0
+
+	vbroadcastsd	104(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm1
+	vfnmadd231pd	%ymm11, %ymm12, %ymm5
+	vmovapd			0(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	112(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm2
+	vfnmadd231pd	%ymm11, %ymm12, %ymm6
+
+	vbroadcastsd	120(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm3
+	addq	$128, %r13
+	vfnmadd231pd	%ymm11, %ymm12, %ymm7
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm0
+	vfnmadd231pd	%ymm9, %ymm12, %ymm4
+	vmovapd			32(%r11), %ymm10 // A0
+
+	vbroadcastsd	8(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm1
+	vfnmadd231pd	%ymm9, %ymm12, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	16(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm2
+	vfnmadd231pd	%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	24(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm3
+	vfnmadd231pd	%ymm9, %ymm12, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	32(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm0
+	vfnmadd231pd	%ymm11, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm8 // A0
+
+	vbroadcastsd	40(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm1
+	vfnmadd231pd	%ymm11, %ymm12, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	48(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm2
+	vfnmadd231pd	%ymm11, %ymm12, %ymm6
+
+	vbroadcastsd	56(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm3
+	vfnmadd231pd	%ymm11, %ymm12, %ymm7
+
+	// unroll 2
+	vbroadcastsd	64(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm0
+	vfnmadd231pd	%ymm9, %ymm12, %ymm4
+	vmovapd			96(%r11), %ymm10 // A0
+
+	vbroadcastsd	72(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm1
+	vfnmadd231pd	%ymm9, %ymm12, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	80(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm2
+	vfnmadd231pd	%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	88(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm3
+	vfnmadd231pd	%ymm9, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 3
+	vbroadcastsd	96(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm0
+	vfnmadd231pd	%ymm11, %ymm12, %ymm4
+//	vmovapd			0(%r11), %ymm8 // A0
+
+	vbroadcastsd	104(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm1
+	vfnmadd231pd	%ymm11, %ymm12, %ymm5
+//	vmovapd			0(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	112(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm2
+	vfnmadd231pd	%ymm11, %ymm12, %ymm6
+
+	vbroadcastsd	120(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm3
+	addq	$128, %r13
+	vfnmadd231pd	%ymm11, %ymm12, %ymm7
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd			0(%r11), %ymm8 // A0[0]
+	vmovapd 		0(%r11, %r12, 1), %ymm9 // A1[0]
+	vbroadcastsd	0(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm0
+	vfnmadd231pd	%ymm9, %ymm12, %ymm4
+
+	vbroadcastsd	8(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm1
+	vfnmadd231pd	%ymm9, %ymm12, %ymm5
+	addq	$32, %r11
+
+	vbroadcastsd	16(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm2
+	vfnmadd231pd	%ymm9, %ymm12, %ymm6
+	subl	$1, %r10d
+
+	vbroadcastsd	24(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm3
+	vfnmadd231pd	%ymm9, %ymm12, %ymm7
+	addq	$32, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nt_8x4_lib4, .-inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// rax   <- dirty
+// rbx   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- k
+// r11   <- A+4*sda*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// rax   <- dirty
+// rbx   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
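+// Reference operation for this nn kernel (a sketch; within each 4-row panel of B,
+// element B[kk][jj] sits at byte offset 8*(kk+4*jj), and the B pointer advances by
+// r14 = 4*sdb*sizeof(double) every 4 iterations of kk):
+//   for(kk=0; kk<k; kk++)
+//     for(jj=0; jj<4; jj++) {
+//       for(ii=0; ii<4; ii++) D0[ii][jj] += A0[ii][kk] * B[kk][jj];   // rows 0..3
+//       for(ii=0; ii<4; ii++) D1[ii][jj] += A1[ii][kk] * B[kk][jj];   // rows 4..7
+//     }
+// The prefetcht0 instructions pull in the B panel two panel-steps ahead of the one
+// currently in use.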
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nn_8x4_lib4, @function
+inner_kernel_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_8x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	0(%r13, %r14, 2) // software prefetch
+	prefetcht0	64(%r13, %r14, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vmovapd			32(%r11), %ymm10 // A0
+
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm0
+	vfmadd231pd		%ymm11, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm8 // A0
+
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm1
+	vfmadd231pd		%ymm11, %ymm12, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	72(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm2
+	vfmadd231pd		%ymm11, %ymm12, %ymm6
+
+	vbroadcastsd	104(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm3
+	vfmadd231pd		%ymm11, %ymm12, %ymm7
+
+	// unroll 2
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vmovapd			96(%r11), %ymm10 // A0
+
+	vbroadcastsd	48(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	80(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	112(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 3
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm0
+	vfmadd231pd		%ymm11, %ymm12, %ymm4
+	vmovapd			0(%r11), %ymm8 // A0
+
+	vbroadcastsd	56(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm1
+	vfmadd231pd		%ymm11, %ymm12, %ymm5
+	vmovapd			0(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	88(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm2
+	vfmadd231pd		%ymm11, %ymm12, %ymm6
+
+	vbroadcastsd	120(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm3
+	addq	%r14, %r13
+	vfmadd231pd		%ymm11, %ymm12, %ymm7
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vmovapd			32(%r11), %ymm10 // A0
+
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm0
+	vfmadd231pd		%ymm11, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm8 // A0
+
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm1
+	vfmadd231pd		%ymm11, %ymm12, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	72(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm2
+	vfmadd231pd		%ymm11, %ymm12, %ymm6
+
+	vbroadcastsd	104(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm3
+	vfmadd231pd		%ymm11, %ymm12, %ymm7
+
+	// unroll 2
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vmovapd			96(%r11), %ymm10 // A0
+
+	vbroadcastsd	48(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	80(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	112(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 3
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm0
+	vfmadd231pd		%ymm11, %ymm12, %ymm4
+//	vmovapd			0(%r11), %ymm8 // A0
+
+	vbroadcastsd	56(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm1
+	vfmadd231pd		%ymm11, %ymm12, %ymm5
+//	vmovapd			0(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	88(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm2
+	vfmadd231pd		%ymm11, %ymm12, %ymm6
+
+	vbroadcastsd	120(%r13), %ymm12
+	vfmadd231pd		%ymm10, %ymm12, %ymm3
+	addq	%r14, %r13
+	vfmadd231pd		%ymm11, %ymm12, %ymm7
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd			0(%r11), %ymm8 // A0[0]
+	vmovapd 		0(%r11, %r12, 1), %ymm9 // A1[0]
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	addq	$32, %r11
+
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	subl	$1, %r10d
+
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+	addq	$8, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nn_8x4_lib4, .-inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- k
+// r11   <- A+4*sda*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
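+//
+// the routine accumulates D(0:7,0:3) -= A*B over k steps, reading A as two 4-row
+// panels (panel stride in r12) and B in nn layout; the main loop is unrolled by 4
+// and is followed by a one-step clean-up loop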
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nn_8x4_lib4, @function
+inner_kernel_dgemm_sub_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_8x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	0(%r13, %r14, 2) // software prefetch
+	prefetcht0	64(%r13, %r14, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm0
+	vfnmadd231pd	%ymm9, %ymm12, %ymm4
+	vmovapd			32(%r11), %ymm10 // A0
+
+	vbroadcastsd	32(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm1
+	vfnmadd231pd	%ymm9, %ymm12, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	64(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm2
+	vfnmadd231pd	%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	96(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm3
+	vfnmadd231pd	%ymm9, %ymm12, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm0
+	vfnmadd231pd	%ymm11, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm8 // A0
+
+	vbroadcastsd	40(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm1
+	vfnmadd231pd	%ymm11, %ymm12, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	72(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm2
+	vfnmadd231pd	%ymm11, %ymm12, %ymm6
+
+	vbroadcastsd	104(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm3
+	vfnmadd231pd	%ymm11, %ymm12, %ymm7
+
+	// unroll 2
+	vbroadcastsd	16(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm0
+	vfnmadd231pd	%ymm9, %ymm12, %ymm4
+	vmovapd			96(%r11), %ymm10 // A0
+
+	vbroadcastsd	48(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm1
+	vfnmadd231pd	%ymm9, %ymm12, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	80(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm2
+	vfnmadd231pd	%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	112(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm3
+	vfnmadd231pd	%ymm9, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 3
+	vbroadcastsd	24(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm0
+	vfnmadd231pd	%ymm11, %ymm12, %ymm4
+	vmovapd			0(%r11), %ymm8 // A0
+
+	vbroadcastsd	56(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm1
+	vfnmadd231pd	%ymm11, %ymm12, %ymm5
+	vmovapd			0(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	88(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm2
+	vfnmadd231pd	%ymm11, %ymm12, %ymm6
+
+	vbroadcastsd	120(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm3
+	addq	%r14, %r13
+	vfnmadd231pd	%ymm11, %ymm12, %ymm7
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm0
+	vfnmadd231pd	%ymm9, %ymm12, %ymm4
+	vmovapd			32(%r11), %ymm10 // A0
+
+	vbroadcastsd	32(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm1
+	vfnmadd231pd	%ymm9, %ymm12, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	64(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm2
+	vfnmadd231pd	%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	96(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm3
+	vfnmadd231pd	%ymm9, %ymm12, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastsd	8(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm0
+	vfnmadd231pd	%ymm11, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm8 // A0
+
+	vbroadcastsd	40(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm1
+	vfnmadd231pd	%ymm11, %ymm12, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	72(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm2
+	vfnmadd231pd	%ymm11, %ymm12, %ymm6
+
+	vbroadcastsd	104(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm3
+	vfnmadd231pd	%ymm11, %ymm12, %ymm7
+
+	// unroll 2
+	vbroadcastsd	16(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm0
+	vfnmadd231pd	%ymm9, %ymm12, %ymm4
+	vmovapd			96(%r11), %ymm10 // A0
+
+	vbroadcastsd	48(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm1
+	vfnmadd231pd	%ymm9, %ymm12, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A1
+
+	vbroadcastsd	80(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm2
+	vfnmadd231pd	%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	112(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm3
+	vfnmadd231pd	%ymm9, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 3
+	vbroadcastsd	24(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm0
+	vfnmadd231pd	%ymm11, %ymm12, %ymm4
+//	vmovapd			0(%r11), %ymm8 // A0
+
+	vbroadcastsd	56(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm1
+	vfnmadd231pd	%ymm11, %ymm12, %ymm5
+//	vmovapd			0(%r11, %r12, 1), %ymm9 // A1
+
+	vbroadcastsd	88(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm2
+	vfnmadd231pd	%ymm11, %ymm12, %ymm6
+
+	vbroadcastsd	120(%r13), %ymm12
+	vfnmadd231pd	%ymm10, %ymm12, %ymm3
+	addq	%r14, %r13
+	vfnmadd231pd	%ymm11, %ymm12, %ymm7
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd			0(%r11), %ymm8 // A0[0]
+	vmovapd 		0(%r11, %r12, 1), %ymm9 // A1[0]
+	vbroadcastsd	0(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm0
+	vfnmadd231pd	%ymm9, %ymm12, %ymm4
+
+	vbroadcastsd	32(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm1
+	vfnmadd231pd	%ymm9, %ymm12, %ymm5
+	addq	$32, %r11
+
+	vbroadcastsd	64(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm2
+	vfnmadd231pd	%ymm9, %ymm12, %ymm6
+	subl	$1, %r10d
+
+	vbroadcastsd	96(%r13), %ymm12
+	vfnmadd231pd	%ymm8, %ymm12, %ymm3
+	vfnmadd231pd	%ymm9, %ymm12, %ymm7
+	addq	$8, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nn_8x4_lib4, .-inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
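+//
+// the routine accumulates D(0:3,0:7) += A*B over k steps, reading a single 4-row
+// panel of A and 8 columns of B in nn layout; the main loop is unrolled by 4 and is
+// followed by a one-step clean-up loop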
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NN_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nn_4x8_lib4, @function
+inner_kernel_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nn_4x8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x8_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd 		0(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	0(%r12, %r13, 2) // software prefetch
+	prefetcht0	64(%r12, %r13, 2) // software prefetch
+	prefetcht0	128(%r12, %r13, 2) // software prefetch
+	prefetcht0	192(%r12, %r13, 2) // software prefetch
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vbroadcastsd	128(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm4
+	vbroadcastsd	160(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm5
+	vbroadcastsd	192(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm6
+	vbroadcastsd	224(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm7
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm0
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm1
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm2
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm3
+	vbroadcastsd	136(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vbroadcastsd	168(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vbroadcastsd	200(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vbroadcastsd	232(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vbroadcastsd	144(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm4
+	vbroadcastsd	176(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm5
+	vbroadcastsd	208(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm6
+	vbroadcastsd	240(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm7
+
+	// unroll 0
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm1
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm2
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm3
+	vbroadcastsd	152(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vbroadcastsd	184(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vbroadcastsd	216(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vbroadcastsd	248(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	addq	%r13, %r12
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			32(%r11), %ymm14 // A
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vbroadcastsd	128(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm4
+	vbroadcastsd	160(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm5
+	vbroadcastsd	192(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm6
+	vbroadcastsd	224(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm7
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastsd	8(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm0
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastsd	40(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm1
+	vbroadcastsd	72(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm2
+	vbroadcastsd	104(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm3
+	vbroadcastsd	136(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vbroadcastsd	168(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vbroadcastsd	200(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vbroadcastsd	232(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastsd	16(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vmovapd			-32(%r11), %ymm14 // A
+	vbroadcastsd	48(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	80(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	112(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vbroadcastsd	144(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm4
+	vbroadcastsd	176(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm5
+	vbroadcastsd	208(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm6
+	vbroadcastsd	240(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm7
+
+	// unroll 0
+	vbroadcastsd	24(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm0
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	56(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm1
+	vbroadcastsd	88(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm2
+	vbroadcastsd	120(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm3
+	vbroadcastsd	152(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm4
+	vbroadcastsd	184(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm5
+	vbroadcastsd	216(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm6
+	vbroadcastsd	248(%r12), %ymm12 // B
+	vfmadd231pd		%ymm14, %ymm12, %ymm7
+	addq	%r13, %r12
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastsd	0(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm0
+	vbroadcastsd	32(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm1
+	vbroadcastsd	64(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm2
+	vbroadcastsd	96(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm3
+	vbroadcastsd	128(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm4
+	vbroadcastsd	160(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm5
+	vbroadcastsd	192(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm6
+	vbroadcastsd	224(%r12), %ymm12 // B
+	vfmadd231pd		%ymm13, %ymm12, %ymm7
+
+	addq	$32, %r11
+	addq	$8, %r12
+	subl	$1, %r10d
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nn_4x8_lib4, .-inner_kernel_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- B
+// r12   <- C
+// r13   <- 32*sdc
+// ymm0  <- [a00 a10 a20 a30]
+// ymm1  <- [a01 a11 a21 a31]
+// ymm2  <- [a02 a12 a22 a32]
+// ymm3  <- [a03 a13 a23 a33]
+// ymm4  <- [a40 a50 a60 a70]
+// ymm5  <- [a41 a51 a61 a71]
+// ymm6  <- [a42 a52 a62 a72]
+// ymm7  <- [a43 a53 a63 a73]
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- ?
+// r12   <- ?
+// r13   <- 32*sdc
+// ymm0  <- [a00 a10 a20 a30]
+// ymm1  <- [a01 a11 a21 a31]
+// ymm2  <- [a02 a12 a22 a32]
+// ymm3  <- [a03 a13 a23 a33]
+// ymm4  <- [a40 a50 a60 a70]
+// ymm5  <- [a41 a51 a61 a71]
+// ymm6  <- [a42 a52 a62 a72]
+// ymm7  <- [a43 a53 a63 a73]
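+//
+// the routine applies the rank-4 update C += A*B in place: the 8x4 block of A stays
+// in ymm0-ymm7, while columns of C are loaded, updated with four broadcasts of B and
+// stored back (4 columns per main-loop iteration, 1 per clean-up iteration)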
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEBP_ADD_NN_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgebp_add_nn_8x4_lib4, @function
+inner_kernel_dgebp_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgebp_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_8x4_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	cmpl	$3, %r10d
+	jle		2f // cleanup loop
+
+	// main loop
+	.p2align 3
+1:
+	vmovapd			0(%r12), %ymm12
+	vmovapd			0(%r12, %r13, 1), %ymm14
+	vbroadcastsd	0(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vfmadd231pd		%ymm4, %ymm13, %ymm14
+	vbroadcastsd	8(%r11), %ymm13
+	subl	$4, %r10d
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vfmadd231pd		%ymm5, %ymm13, %ymm14
+	vbroadcastsd	16(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vfmadd231pd		%ymm6, %ymm13, %ymm14
+	vbroadcastsd	24(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm12
+	vfmadd231pd		%ymm7, %ymm13, %ymm14
+	vmovapd			%ymm12, 0(%r12)
+	vmovapd			%ymm14, 0(%r12, %r13, 1)
+
+	vmovapd			32(%r12), %ymm12
+	vmovapd			32(%r12, %r13, 1), %ymm14
+	vbroadcastsd	32(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vfmadd231pd		%ymm4, %ymm13, %ymm14
+	vbroadcastsd	40(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vfmadd231pd		%ymm5, %ymm13, %ymm14
+	vbroadcastsd	48(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vfmadd231pd		%ymm6, %ymm13, %ymm14
+	vbroadcastsd	56(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm12
+	vfmadd231pd		%ymm7, %ymm13, %ymm14
+	vmovapd			%ymm12, 32(%r12)
+	vmovapd			%ymm14, 32(%r12, %r13, 1)
+
+	vmovapd			64(%r12), %ymm12
+	vmovapd			64(%r12, %r13, 1), %ymm14
+	vbroadcastsd	64(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vfmadd231pd		%ymm4, %ymm13, %ymm14
+	vbroadcastsd	72(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vfmadd231pd		%ymm5, %ymm13, %ymm14
+	vbroadcastsd	80(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vfmadd231pd		%ymm6, %ymm13, %ymm14
+	vbroadcastsd	88(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm12
+	vfmadd231pd		%ymm7, %ymm13, %ymm14
+	vmovapd			%ymm12, 64(%r12)
+	vmovapd			%ymm14, 64(%r12, %r13, 1)
+
+	vmovapd			96(%r12), %ymm12
+	vmovapd			96(%r12, %r13, 1), %ymm14
+	vbroadcastsd	96(%r11), %ymm13
+	addq	$128, %r11
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vfmadd231pd		%ymm4, %ymm13, %ymm14
+	vbroadcastsd	-24(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vfmadd231pd		%ymm5, %ymm13, %ymm14
+	vbroadcastsd	-16(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vfmadd231pd		%ymm6, %ymm13, %ymm14
+	vbroadcastsd	-8(%r11), %ymm13
+	addq	$128, %r12
+	vfmadd231pd		%ymm3, %ymm13, %ymm12
+	vfmadd231pd		%ymm7, %ymm13, %ymm14
+	vmovapd			%ymm12, -32(%r12)
+	vmovapd			%ymm14, -32(%r12, %r13, 1)
+
+	cmpl	$3, %r10d
+	jg		1b // main loop
+
+	cmpl	$0, %r10d
+	jle		0f // return
+
+	// cleanup loop
+2:
+	vmovapd			0(%r12), %ymm12
+	vmovapd			0(%r12, %r13, 1), %ymm14
+	vbroadcastsd	0(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vfmadd231pd		%ymm4, %ymm13, %ymm14
+	vbroadcastsd	8(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vfmadd231pd		%ymm5, %ymm13, %ymm14
+	vbroadcastsd	16(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vfmadd231pd		%ymm6, %ymm13, %ymm14
+	vbroadcastsd	24(%r11), %ymm13
+	vfmadd231pd		%ymm3, %ymm13, %ymm12
+	vfmadd231pd		%ymm7, %ymm13, %ymm14
+	vmovapd			%ymm12, 0(%r12)
+	vmovapd			%ymm14, 0(%r12, %r13, 1)
+
+	addq	$32, %r11
+	addq	$32, %r12
+
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+	jg		2b // main loop
+
+	// return
+0:
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgebp_add_nn_8x4_lib4, .-inner_kernel_dgebp_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B-offB+bs*sdb*sizeof(double)
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
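+//
+// the edge performs min(k,4-offB) single k-iterations so that B starts at a panel
+// boundary when the main nn kernel takes over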
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgemm_add_nn_8x4_lib4, @function
+inner_edge_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgemm_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_8x4_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r15d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$4, %ebx
+	subl			%r15d, %ebx // 4-offsetB
+	cmpl			%r10d, %ebx
+//	jle				0f
+//	movl			%r10d, %ebx // kend=min(k,4-offsetB)
+//0:
+	cmovgl			%r10d, %ebx // kend=min(k,4-offsetB)
+
+	movl			%r15d, %eax
+	sall			$3, %eax // offsetB*sizeof(double)
+	addq			%rax, %r13 // B+offsetB*sizeof(double)
+
+	movq			%r11, %rax // A1 <- A0
+	addq			%r12, %rax // A1 <- A0 + 4*sda*sizeof(double)
+
+1:
+	vmovapd			0(%r11), %ymm12 // A0[0]
+	vmovapd			0(%rax), %ymm14 // A1[0]
+	vbroadcastsd	0(%r13), %ymm13 // B[0]
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vfmadd231pd		%ymm14, %ymm13, %ymm4
+	vbroadcastsd	32(%r13), %ymm13 // B[1]
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vfmadd231pd		%ymm14, %ymm13, %ymm5
+	vbroadcastsd	64(%r13), %ymm13 // B[2]
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vfmadd231pd		%ymm14, %ymm13, %ymm6
+	vbroadcastsd	96(%r13), %ymm13 // B[3]
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vfmadd231pd		%ymm14, %ymm13, %ymm7
+
+	subl			$1, %r10d // k-1
+	subl			$1, %ebx // kend-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$32, %rax // A1+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %ebx
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r14, %r13
+	subq			$32, %r13 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgemm_add_nn_8x4_lib4, .-inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
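+//
+// same idea as the 8x4 edge above: min(k,4-offB) single k-iterations are performed
+// so that B starts at a panel boundary when the main nn kernel takes over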
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGEMM_ADD_NN_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgemm_add_nn_4x8_lib4, @function
+inner_edge_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgemm_add_nn_4x8_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x8_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r14d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$4, %r15d
+	subl			%r14d, %r15d // 4-offsetB
+	cmpl			%r10d, %r15d
+//	jle				0f
+//	movl			%r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+	cmovgl			%r10d, %r15d // kend=min(k,4-offsetB)
+
+	movl			%r14d, %eax
+	sall			$3, %eax // offsetB*sizeof(double)
+	addq			%rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+	vmovapd			0(%r11), %ymm12
+	vbroadcastsd	0(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm0
+	vbroadcastsd	32(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm1
+	vbroadcastsd	64(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm2
+	vbroadcastsd	96(%r12), %ymm13
+	vfmadd231pd		%ymm12, %ymm13, %ymm3
+	vbroadcastsd	128(%r12), %ymm13 // B
+	vfmadd231pd		%ymm12, %ymm13, %ymm4
+	vbroadcastsd	160(%r12), %ymm13 // B
+	vfmadd231pd		%ymm12, %ymm13, %ymm5
+	vbroadcastsd	192(%r12), %ymm13 // B
+	vfmadd231pd		%ymm12, %ymm13, %ymm6
+	vbroadcastsd	224(%r12), %ymm13 // B
+	vfmadd231pd		%ymm12, %ymm13, %ymm7
+
+	subl			$1, %r10d // k-1
+	subl			$1, %r15d // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r15d
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r13, %r12
+	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgemm_add_nn_4x8_lib4, .-inner_edge_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10   <- A
+// r11   <- 4*sda*sizeof(double)
+// r12   <- B
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- A+4*4*sizeof(double)
+// r11   <- 4*sda*sizeof(double)
+// r12   <- B+4*4*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
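+//
+// the edge performs the first 4 k-iterations, where B is upper triangular, touching
+// 1, 2, 3 and then 4 columns of the accumulators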
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_8x4_lib4, @function
+inner_edge_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_8x4_lib4:
+#endif
+#endif
+	
+	movq	%r10, %r15 // A1 <- A0
+	addq	%r11, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+	vbroadcastsd	0(%r12), %ymm12
+	vmovapd			0(%r10), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vmovapd			0(%r15), %ymm9
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+
+	vbroadcastsd	32(%r12), %ymm12
+	vmovapd			32(%r10), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vmovapd			32(%r15), %ymm9
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	40(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+
+	vbroadcastsd	64(%r12), %ymm12
+	vmovapd			64(%r10), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vmovapd			64(%r15), %ymm9
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	72(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	80(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	vbroadcastsd	96(%r12), %ymm12
+	vmovapd			96(%r10), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vmovapd			96(%r15), %ymm9
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	104(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	112(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	vbroadcastsd	120(%r12), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+
+	addq			$128, %r10
+	addq			$128, %r12
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_8x4_lib4, .-inner_edge_dtrmm_nt_ru_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- max(k-4,0)
+// r11   <- A+4*4*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+4*4*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
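+//
+// variable-size variant of the edge above: up to 4 triangular k-iterations are
+// performed, with an early exit as soon as k reaches zero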
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_8x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#endif
+#endif
+	
+	movq	%r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+	vbroadcastsd	0(%r13), %ymm12
+	subl			$1, %r10d
+	vmovapd			0(%r11), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	addq			$32, %r11
+	vmovapd			0(%r15), %ymm9
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	addq			$32, %r13
+	addq			$32, %r15
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vbroadcastsd	0(%r13), %ymm12
+	subl			$1, %r10d
+	vmovapd			0(%r11), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vmovapd			0(%r15), %ymm9
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	addq			$32, %r11
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	addq			$32, %r13
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	addq			$32, %r15
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vbroadcastsd	0(%r13), %ymm12
+	subl			$1, %r10d
+	vmovapd			0(%r11), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vmovapd			0(%r15), %ymm9
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	addq			$32, %r11
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	addq			$32, %r13
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	addq			$32, %r15
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vbroadcastsd	0(%r13), %ymm12
+	subl			$1, %r10d
+	vmovapd			0(%r11), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vmovapd			0(%r15), %ymm9
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	addq			$32, %r11
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	addq			$32, %r13
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+	addq			$32, %r15
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_8x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A0
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B-offB+bs*sdb*sizeof(double)
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
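+//
+// the edge dispatches on offB (0..3) and performs the k-iterations covering the
+// lower-triangular head of B, leaving A and B aligned for the main nn kernel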
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NN_RL_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nn_rl_8x4_lib4, @function
+inner_edge_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nn_rl_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_8x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r15d
+	jg		0f
+
+	// offB==0
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r12, 1), %ymm9
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r12, 1), %ymm9
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	48(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	80(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r12, 1), %ymm9
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	56(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	88(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	vbroadcastsd	120(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A0+4*bs*sizeof(double)
+	addq			%r14, %r13 // B+bs*sdb*sizeof(double)
+
+	jmp		3f
+
+0:
+	cmpl	$1, %r15d
+	jg		1f
+
+	// offB==1
+
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r12, 1), %ymm9
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r12, 1), %ymm9
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	48(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	80(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	subl			$3, %r10d // k-3
+	addq			$96, %r11 // A0+3*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$8, %r13 // B+bs*sdb*sizeof(double)-1
+
+	jmp		3f
+
+1:
+	cmpl	$2, %r15d
+	jg		2f
+
+	// offB==2
+
+	addq			$16, %r13 // B+2*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r12, 1), %ymm9
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+
+	subl			$2, %r10d // k-2
+	addq			$64, %r11 // A0+2*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$16, %r13 // B+bs*sdb*sizeof(double)-2
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r12, 1), %ymm9
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	72(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	vbroadcastsd	104(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r12, 1), %ymm9
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	48(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	80(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	vbroadcastsd	112(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r12, 1), %ymm9
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	56(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	88(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	vbroadcastsd	120(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A0+4*bs*sizeof(double)
+	addq			%r14, %r13 // B+bs*sdb*sizeof(double)
+
+	jmp		3f
+
+2:
+	// offB==3
+
+	addq			$24, %r13 // B+3*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-3
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+
+	vmovapd			32(%r11), %ymm8
+	vmovapd			32(%r11, %r12, 1), %ymm9
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	40(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	72(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	vmovapd			64(%r11), %ymm8
+	vmovapd			64(%r11, %r12, 1), %ymm9
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	48(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	80(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	vbroadcastsd	112(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+
+	vmovapd			96(%r11), %ymm8
+	vmovapd			96(%r11, %r12, 1), %ymm9
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	56(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	88(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	vbroadcastsd	120(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+
+	subl			$4, %r10d // k-4
+	addq			$128, %r11 // A0+4*bs*sizeof(double)
+	addq			%r14, %r13 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nn_rl_8x4_lib4, .-inner_edge_dtrmm_nn_rl_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A0
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- bs*sda*sizeof(double)
+// r13   <- B-offB+bs*sdb*sizeof(double)
+// r14   <- bs*sdb*sizeof(double)
+// r15   <- offB
+// rax   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
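+//
+// variable-size variant of the edge above: same dispatch on offB (0..3), but k is
+// checked before every single iteration so the routine can exit early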
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NN_RL_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nn_rl_8x4_vs_lib4, @function
+inner_edge_dtrmm_nn_rl_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nn_rl_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_8x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	cmpl			$0, %r15d
+	jg				0f // offB>0
+
+	// offB==0
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f // end
+
+0:
+	cmpl			$1, %r15d
+	jg				1f // offB>1
+
+	// offB==1
+
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f // end
+
+1:
+	cmpl			$2, %r15d
+	jg				2f // offB>2
+
+	// offB==2
+
+	addq			$16, %r13 // B+2*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+
+	subl			$1, %r10d // k-2
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	jmp				3f
+
+2:
+	// offB==3
+
+	addq			$24, %r13 // B+3*sizeof(double)
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+
+	subl			$1, %r10d // k-1
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			$8, %r13 // B+1*sizeof(double)
+
+	cmpl			$0, %r10d
+	jle				3f // end
+
+	vmovapd			0(%r11), %ymm8
+	vmovapd			0(%r11, %r12, 1), %ymm9
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vfmadd231pd		%ymm9, %ymm12, %ymm4
+	vbroadcastsd	32(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	vfmadd231pd		%ymm9, %ymm12, %ymm5
+	vbroadcastsd	64(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vfmadd231pd		%ymm9, %ymm12, %ymm6
+	vbroadcastsd	96(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	vfmadd231pd		%ymm9, %ymm12, %ymm7
+
+	subl			$1, %r10d // k-4
+	addq			$32, %r11 // A0+1*bs*sizeof(double)
+	addq			%r14, %r13
+	subq			$24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nn_rl_8x4_vs_lib4, .-inner_edge_dtrmm_nn_rl_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
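+//
+// two rounds of vblendpd bring the accumulators from the interleaved layout shown
+// above back to plain column order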
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_8x4_lib4, @function
+inner_blend_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_8x4_lib4:
+#endif
+#endif
+	
+
+	// tc==n
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vblendpd	$0xa, %ymm5, %ymm4, %ymm8
+	vblendpd	$0x5, %ymm5, %ymm4, %ymm9
+	vblendpd	$0xa, %ymm7, %ymm6, %ymm10
+	vblendpd	$0x5, %ymm7, %ymm6, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm4
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm6
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm5
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_8x4_lib4, .-inner_blend_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
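+//
+// i.e. D = D + C (alpha = 1.0, beta = 1.0), with C read as two 4-row panels at
+// stride 4*sdc*sizeof(double)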
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_8x4_lib4, @function
+inner_scale_11_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_8x4_lib4:
+#endif
+#endif
+	
+
+	movq	%r10, %r12 // C1 <- C0
+	addq	%r11, %r12 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	vmovapd		0(%r10), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+	vmovapd		0(%r12), %ymm15
+	vaddpd		%ymm4, %ymm15, %ymm4
+	vmovapd		32(%r12), %ymm15
+	vaddpd		%ymm5, %ymm15, %ymm5
+	vmovapd		64(%r12), %ymm15
+	vaddpd		%ymm6, %ymm15, %ymm6
+	vmovapd		96(%r12), %ymm15
+	vaddpd		%ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_8x4_lib4, .-inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
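+//
+// i.e. D = alpha*D + beta*C; the read of C is skipped entirely when beta == 0.0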
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x4_lib4, @function
+inner_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_lib4:
+#endif
+#endif
+	
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	movq	%r12, %r15 // C1 <- C0
+	addq	%r13, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	// alg==1
+	vmovapd		0(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+	vmovapd		0(%r15), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		32(%r15), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		64(%r15), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		96(%r15), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta, C given with a row offset (generalized)
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- offset
+// r13   <- C
+// r14   <- 4*sdc*sizeof(double)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- offset
+// r13   <- C
+// r14   <- 4*sdc*sizeof(double)
+// r15  <- n0 // col index: start from (inc)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
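+//
+// reference semantics (informal sketch): same update as inner_scale_ab_8x4_lib4,
+// but the C operand starts at row "offset" (0..3) inside its 4-row panel, so a
+// loaded 4-element column can span two consecutive panels; e.g. for offset==1
+// the column jj used for the upper 4 rows is
+//
+//   [ C0[1+4*jj], C0[2+4*jj], C0[3+4*jj], C1[0+4*jj] ]
+//
+// (C0 at r13, C1 one panel further down), assembled by the vblendpd /
+// vperm2f128 / vshufpd sequences below before the fused multiply-add with beta.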
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x4_gen_lib4, @function
+inner_scale_ab_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_gen_lib4:
+#endif
+#endif
+	
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	movq	%r13, %rax // C1 <- C0
+	addq	%r14, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+
+	vxorpd		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovapd		0(%r13), %ymm14
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r13), %ymm14
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r13), %ymm14
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r13), %ymm14
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+	vmovapd		0(%rax), %ymm14
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		32(%rax), %ymm14
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		64(%rax), %ymm14
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		96(%rax), %ymm14
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+	jmp		3f
+
+0:
+
+	movq	%rax, %rbx // C2 <- C1
+	addq	%r14, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+	cmpl	$1, %r12d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%rax), %ymm13
+	vmovapd		0(%rbx), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vfmadd231pd	%ymm13, %ymm15, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%rax), %ymm13
+	vmovapd		32(%rbx), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	vfmadd231pd	%ymm13, %ymm15, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%rax), %ymm13
+	vmovapd		64(%rbx), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vfmadd231pd	%ymm13, %ymm15, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%rax), %ymm13
+	vmovapd		96(%rbx), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+	vfmadd231pd	%ymm13, %ymm15, %ymm7
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r12d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%rax), %ymm13
+	vmovapd		0(%rbx), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vfmadd231pd	%ymm13, %ymm15, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%rax), %ymm13
+	vmovapd		32(%rbx), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	vfmadd231pd	%ymm13, %ymm15, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%rax), %ymm13
+	vmovapd		64(%rbx), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vfmadd231pd	%ymm13, %ymm15, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%rax), %ymm13
+	vmovapd		96(%rbx), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+	vfmadd231pd	%ymm13, %ymm15, %ymm7
+
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%rax), %ymm13
+	vmovapd		0(%rbx), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vfmadd231pd	%ymm13, %ymm15, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%rax), %ymm13
+	vmovapd		32(%rbx), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	vfmadd231pd	%ymm13, %ymm15, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%rax), %ymm13
+	vmovapd		64(%rbx), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vfmadd231pd	%ymm13, %ymm15, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%rax), %ymm13
+	vmovapd		96(%rbx), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+	vfmadd231pd	%ymm13, %ymm15, %ymm7
+
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x4_gen_lib4, .-inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10   <- alpha
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
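+//
+// reference semantics (informal sketch): beta is implicitly 0.0, so the 8x4
+// accumulator in ymm0..ymm7 is just scaled in place, acc[ii][jj] *= alpha,
+// and C is never read.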
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_A0_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_a0_8x4_lib4, @function
+inner_scale_a0_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_a0_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_8x4_lib4:
+#endif
+#endif
+	
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_a0_8x4_lib4, .-inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
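+//
+// reference semantics (informal sketch): the NT microkernel leaves each 4x4
+// accumulator block with its elements rotated (see the input layout above);
+// two rounds of vblendpd restore plain column order before scaling, e.g. for
+// the upper block:
+//
+//   t01a = blend(ymm0, ymm1, 0b1010)   // [d00 d10 d22 d32]
+//   t01b = blend(ymm0, ymm1, 0b0101)   // [d01 d11 d23 d33]
+//   // analogous blends of ymm2/ymm3 give t23a/t23b, then masks 0b1100/0b0011
+//   // merge the 128-bit halves into the columns [d00 d10 d20 d30] etc.
+//
+// after the blend, the alpha/beta update is the same as inner_scale_ab_8x4_lib4.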
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x4_lib4, @function
+inner_blend_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_lib4:
+#endif
+#endif
+	
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	// tc==n
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vblendpd	$0xa, %ymm5, %ymm4, %ymm8
+	vblendpd	$0x5, %ymm5, %ymm4, %ymm9
+	vblendpd	$0xa, %ymm7, %ymm6, %ymm10
+	vblendpd	$0x5, %ymm7, %ymm6, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm4
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm6
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm5
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm7
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	movq	%r12, %r15 // C1 <- C0
+	addq	%r13, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	// alg==1
+	vmovapd		0(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+	vmovapd		0(%r15), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		32(%r15), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		64(%r15), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		96(%r15), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x4_lib4, .-inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r10   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
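+//
+// reference semantics (informal sketch): here the accumulator is a 4x8 block
+// with one column per register (ymm0..ymm7), and C is a single 4-row panel of
+// 8 columns at byte offsets 0,32,...,224 from r12:
+//
+//   for(jj=0; jj<8; jj++) {
+//       acc[jj] *= alpha;
+//       if(beta!=0.0) acc[jj] += beta * C[4*jj .. 4*jj+3];
+//   }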
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_4x8_lib4, @function
+inner_scale_ab_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_4x8_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x8_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+	vmovapd		128(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		160(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		192(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		224(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_4x8_lib4, .-inner_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- &alpha
+// r11   <- &beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- &alpha
+// r11   <- &beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
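+//
+// reference semantics (informal sketch): the 8x4 accumulator (two 4x4 blocks
+// in ymm0..ymm3 and ymm4..ymm7) is transposed into a 4x8 block, then scaled
+// like inner_scale_ab_4x8_lib4; each 4x4 transpose uses the usual AVX idiom:
+//
+//   t0 = unpcklpd(ymm0, ymm1);  t1 = unpckhpd(ymm0, ymm1);
+//   t2 = unpcklpd(ymm2, ymm3);  t3 = unpckhpd(ymm2, ymm3);
+//   ymm0 = perm2f128(t0, t2, 0x20);  ymm2 = perm2f128(t0, t2, 0x31);
+//   ymm1 = perm2f128(t1, t3, 0x20);  ymm3 = perm2f128(t1, t3, 0x31);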
+
+#if MACRO_LEVEL>=1
+	.macro INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_tran_scale_ab_4x8_lib4, @function
+inner_tran_scale_ab_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_tran_scale_ab_4x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_lib4:
+#endif
+#endif
+		
+	vunpcklpd	%ymm1, %ymm0, %ymm12
+	vunpckhpd	%ymm1, %ymm0, %ymm13
+	vunpcklpd	%ymm3, %ymm2, %ymm14
+	vunpckhpd	%ymm3, %ymm2, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm0
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm2
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm3
+
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vunpcklpd	%ymm5, %ymm4, %ymm12
+	vunpckhpd	%ymm5, %ymm4, %ymm13
+	vunpcklpd	%ymm7, %ymm6, %ymm14
+	vunpckhpd	%ymm7, %ymm6, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm4
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm6
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm5
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm7
+
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	vbroadcastsd 0(%r11), %ymm14 // beta
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+	vmovapd		128(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		160(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		192(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		224(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_tran_scale_ab_4x8_lib4, .-inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta, C given with a row offset (generalized)
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- offset
+// r13   <- C
+// r14   <- 4*sdc*sizeof(double)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- offset
+// r13   <- C
+// r14   <- 4*sdc*sizeof(double)
+// r15  <- n0 // col index: start from (inc)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
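+//
+// reference note (informal): this routine combines the vblendpd un-rotation of
+// inner_blend_scale_ab_8x4_lib4 with the offset-aware beta*C update of
+// inner_scale_ab_8x4_gen_lib4; there is no new arithmetic beyond those two.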
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x4_gen_lib4, @function
+inner_blend_scale_ab_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_gen_lib4:
+#endif
+#endif
+	
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	// tc==n
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vblendpd	$0xa, %ymm5, %ymm4, %ymm8
+	vblendpd	$0x5, %ymm5, %ymm4, %ymm9
+	vblendpd	$0xa, %ymm7, %ymm6, %ymm10
+	vblendpd	$0x5, %ymm7, %ymm6, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm4
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm6
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm5
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm7
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	movq	%r13, %rax // C1 <- C0
+	addq	%r14, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+
+	vxorpd		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovapd		0(%r13), %ymm14
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r13), %ymm14
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r13), %ymm14
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r13), %ymm14
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+	vmovapd		0(%rax), %ymm14
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		32(%rax), %ymm14
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		64(%rax), %ymm14
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		96(%rax), %ymm14
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+	jmp		3f
+
+0:
+
+	movq	%rax, %rbx // C2 <- C1
+	addq	%r14, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+	cmpl	$1, %r12d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%rax), %ymm13
+	vmovapd		0(%rbx), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vfmadd231pd	%ymm13, %ymm15, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%rax), %ymm13
+	vmovapd		32(%rbx), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	vfmadd231pd	%ymm13, %ymm15, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%rax), %ymm13
+	vmovapd		64(%rbx), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vfmadd231pd	%ymm13, %ymm15, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%rax), %ymm13
+	vmovapd		96(%rbx), %ymm14
+	vblendpd	$0x1, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x1, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm14, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+	vfmadd231pd	%ymm13, %ymm15, %ymm7
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r12d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%rax), %ymm13
+	vmovapd		0(%rbx), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vfmadd231pd	%ymm13, %ymm15, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%rax), %ymm13
+	vmovapd		32(%rbx), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	vfmadd231pd	%ymm13, %ymm15, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%rax), %ymm13
+	vmovapd		64(%rbx), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vfmadd231pd	%ymm13, %ymm15, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%rax), %ymm13
+	vmovapd		96(%rbx), %ymm14
+	vblendpd	$0x3, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+	vfmadd231pd	%ymm13, %ymm15, %ymm7
+
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		0(%r13), %ymm12
+	vmovapd		0(%rax), %ymm13
+	vmovapd		0(%rbx), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm0
+	vfmadd231pd	%ymm13, %ymm15, %ymm4
+
+	vmovapd		32(%r13), %ymm12
+	vmovapd		32(%rax), %ymm13
+	vmovapd		32(%rbx), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm1
+	vfmadd231pd	%ymm13, %ymm15, %ymm5
+
+	vmovapd		64(%r13), %ymm12
+	vmovapd		64(%rax), %ymm13
+	vmovapd		64(%rbx), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm2
+	vfmadd231pd	%ymm13, %ymm15, %ymm6
+
+	vmovapd		96(%r13), %ymm12
+	vmovapd		96(%rax), %ymm13
+	vmovapd		96(%rbx), %ymm14
+	vblendpd	$0x7, %ymm13, %ymm12, %ymm12
+	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
+	vperm2f128	$0x01, %ymm12, %ymm12, %ymm14
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm12
+	vperm2f128	$0x01, %ymm13, %ymm13, %ymm14
+	vshufpd		$0x5, %ymm13, %ymm14, %ymm13
+	vfmadd231pd	%ymm12, %ymm15, %ymm3
+	vfmadd231pd	%ymm13, %ymm15, %ymm7
+
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x4_gen_lib4, .-inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
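+//
+// reference note (informal): same vblendpd un-rotation as
+// inner_blend_scale_ab_8x4_lib4, followed by a plain acc += C for both 4x4
+// blocks (alpha = beta = 1.0, so no multiplications are needed).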
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_8x4_lib4, @function
+inner_blend_scale_11_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_lib4:
+#endif
+#endif
+
+	// tc==n
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vblendpd	$0xa, %ymm5, %ymm4, %ymm8
+	vblendpd	$0x5, %ymm5, %ymm4, %ymm9
+	vblendpd	$0xa, %ymm7, %ymm6, %ymm10
+	vblendpd	$0x5, %ymm7, %ymm6, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm4
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm6
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm5
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm7
+
+	movq	%r10, %r15 // C1 <- C0
+	addq	%r11, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	vmovapd		0(%r10), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+	vmovapd		0(%r15), %ymm15
+	vaddpd		%ymm4, %ymm15, %ymm4
+	vmovapd		32(%r15), %ymm15
+	vaddpd		%ymm5, %ymm15, %ymm5
+	vmovapd		64(%r15), %ymm15
+	vaddpd		%ymm6, %ymm15, %ymm6
+	vmovapd		96(%r15), %ymm15
+	vaddpd		%ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_8x4_lib4, .-inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
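+//
+// reference note (informal): same transpose as inner_tran_scale_ab_4x8_lib4,
+// followed by acc += C over the eight columns; the constant LC04 provides the
+// beta = 1.0 factor for the fused multiply-add, and alpha is implicitly 1.0.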
+
+#if MACRO_LEVEL>=1
+	.macro INNER_TRAN_SCALE_11_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_tran_scale_11_4x8_lib4, @function
+inner_tran_scale_11_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_11_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_tran_scale_11_4x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_11_4x8_lib4:
+#endif
+#endif
+		
+	vunpcklpd	%ymm1, %ymm0, %ymm12
+	vunpckhpd	%ymm1, %ymm0, %ymm13
+	vunpcklpd	%ymm3, %ymm2, %ymm14
+	vunpckhpd	%ymm3, %ymm2, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm0
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm2
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm3
+
+	vunpcklpd	%ymm5, %ymm4, %ymm12
+	vunpckhpd	%ymm5, %ymm4, %ymm13
+	vunpcklpd	%ymm7, %ymm6, %ymm14
+	vunpckhpd	%ymm7, %ymm6, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm4
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm6
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm5
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm7
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm14 // beta=1.0
+#else
+	vmovapd		LC04(%rip), %ymm14 // beta=1.0
+#endif
+
+	vmovapd		0(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+	vmovapd		128(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		160(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		192(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		224(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_tran_scale_11_4x8_lib4, .-inner_tran_scale_11_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// Cholesky factorization
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
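+//
+// reference semantics (informal sketch; acc denotes the 8x4 block in
+// ymm0..ymm7, columns 0..3): factorize the 4x4 diagonal block and apply it to
+// the 4 rows below, storing the reciprocal pivots in inv_diag_E; only the
+// first kn columns are processed (variable-size tail):
+//
+//   for(jj=0; jj<4; jj++) {
+//       d = acc[jj][jj];
+//       t = (d > 0.0) ? 1.0/sqrt(d) : 0.0;     // non-positive pivot: zero it
+//       inv_diag_E[jj] = t;
+//       acc[:][jj] *= t;                       // scale column jj (8 rows)
+//       for(kk=jj+1; kk<4; kk++)               // trailing rank-1 update
+//           acc[:][kk] -= acc[kk][jj] * acc[:][jj];
+//   }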
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dpotrf_8x4_vs_lib4, @function
+inner_edge_dpotrf_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dpotrf_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_8x4_vs_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd			LC04(%rip), %xmm14 // 1.0
+#endif
+
+	vmovsd			%xmm0, %xmm0, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe				1f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+2:
+	vmovsd			%xmm13, 0(%r10)
+//	vmovddup		%xmm13, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	cmpl			$2, %r11d
+	jl				0f // ret
+//	vperm2f128		$0x00, %ymm0, %ymm0, %ymm12
+//	vpermilpd		$0xf, %ymm12, %ymm13
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+
+	vpermilpd		$0x3, %xmm1, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe				3f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+4:
+	vmovsd			%xmm13, 8(%r10)
+//	vmovddup		%xmm13, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	cmpl			$3, %r11d
+	jl				0f // ret
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe				5f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+6:
+	vmovsd			%xmm13, 16(%r10)
+//	vmovddup		%xmm13, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	cmpl			$4, %r11d
+	jl				0f // ret
+//	vperm2f128		$0x11, %ymm2, %ymm2, %ymm12
+//	vpermilpd		$0xf, %ymm12, %ymm13
+	vpermpd			$0xff, %ymm2, %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+
+//	vextractf128	$0x1, %ymm3, %xmm13
+//	vpermilpd		$0x3, %xmm13, %xmm13
+	vpermpd			$0xff, %ymm3, %ymm13
+	vucomisd		%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe				7f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+8:
+	vmovsd			%xmm13, 24(%r10)
+//	vmovddup		%xmm13, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+
+	jmp				0f
+
+1:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				2b
+
+3:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				4b
+
+5:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				6b
+
+7:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				8b
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dpotrf_8x4_vs_lib4, .-inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for Cholesky factorization
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
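+//
+// reference semantics (informal sketch): right-side forward substitution with
+// the 4x4 lower-triangular factor E (used transposed), whose reciprocal
+// diagonal is precomputed in inv_diag_E:
+//
+//   for(jj=0; jj<4; jj++) {
+//       acc[:][jj] *= inv_diag_E[jj];          // ymm0..ymm3 and ymm4..ymm7
+//       for(kk=jj+1; kk<4; kk++)
+//           acc[:][kk] -= E[kk][jj] * acc[:][jj];
+//   }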
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_8x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_8x4_lib4, .-inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for Cholesky factorization
+//
+// input arguments:
+// r10  <- D
+// r11  <- sdd
+// r12  <- inv_diag_D
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- sdd
+// r12  <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
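+//
+// reference semantics (informal sketch): same substitution for a 4x8
+// right-hand side against an 8x8 lower-triangular D stored in two 4-row
+// panels (the second panel is addressed through the sdd stride in r11):
+//
+//   for(jj=0; jj<8; jj++) {
+//       acc[jj] *= inv_diag_D[jj];
+//       for(kk=jj+1; kk<8; kk++)
+//           acc[kk] -= D[kk][jj] * acc[jj];
+//   }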
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_4x8_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_4x8_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x8_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r12), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vbroadcastsd	0(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm4
+	vbroadcastsd	8(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm5
+	vbroadcastsd	16(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm6
+	vbroadcastsd	24(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm7
+
+	vbroadcastsd	8(%r12), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vbroadcastsd	32(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm4
+	vbroadcastsd	40(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm5
+	vbroadcastsd	48(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm6
+	vbroadcastsd	56(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm7
+
+	vbroadcastsd	16(%r12), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vbroadcastsd	64(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm4
+	vbroadcastsd	72(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm5
+	vbroadcastsd	80(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm6
+	vbroadcastsd	88(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm7
+
+	vbroadcastsd	24(%r12), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vbroadcastsd	96(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm4
+	vbroadcastsd	104(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm5
+	vbroadcastsd	112(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm6
+	vbroadcastsd	120(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm7
+
+	addq	$128, %r10
+
+	vbroadcastsd	32(%r12), %ymm13
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vbroadcastsd	8(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vbroadcastsd	16(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vbroadcastsd	24(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+
+	vbroadcastsd	40(%r12), %ymm13
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vbroadcastsd	48(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vbroadcastsd	56(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+
+	vbroadcastsd	48(%r12), %ymm13
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vbroadcastsd	88(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+
+	vbroadcastsd	56(%r12), %ymm13
+	vmulpd			%ymm7, %ymm13, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_4x8_lib4, .-inner_edge_dtrsm_rlt_inv_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for Cholesky factorization
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r11), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	cmpl			$2, %r12d
+	jl				0f // ret
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+
+
+	vbroadcastsd	8(%r11), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	cmpl			$3, %r12d
+	jl				0f // ret
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+
+	vbroadcastsd	16(%r11), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	cmpl			$4, %r12d
+	jl				0f // ret
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+
+	vbroadcastsd	24(%r11), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_8x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for Cholesky factorization
+//
+// input arguments:
+// r10  <- D
+// r11  <- sdd
+// r12  <- inv_diag_D
+// r13d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- sdd
+// r12  <- inv_diag_D
+// r13d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_4X8_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_4x8_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_4x8_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x8_vs_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r12), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vbroadcastsd	0(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm4
+	vbroadcastsd	8(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm5
+	vbroadcastsd	16(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm6
+	vbroadcastsd	24(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm7
+
+	vbroadcastsd	8(%r12), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vbroadcastsd	32(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm4
+	vbroadcastsd	40(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm5
+	vbroadcastsd	48(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm6
+	vbroadcastsd	56(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm7
+
+	vbroadcastsd	16(%r12), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vbroadcastsd	64(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm4
+	vbroadcastsd	72(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm5
+	vbroadcastsd	80(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm6
+	vbroadcastsd	88(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm7
+
+	vbroadcastsd	24(%r12), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vbroadcastsd	96(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm4
+	vbroadcastsd	104(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm5
+	vbroadcastsd	112(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm6
+	vbroadcastsd	120(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm7
+
+	addq	$128, %r10
+
+	vbroadcastsd	32(%r12), %ymm13
+	vmulpd			%ymm4, %ymm13, %ymm4
+	cmpl			$6, %r13d
+	jl				0f // ret
+	vbroadcastsd	8(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vbroadcastsd	16(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vbroadcastsd	24(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+
+	vbroadcastsd	40(%r12), %ymm13
+	vmulpd			%ymm5, %ymm13, %ymm5
+	cmpl			$7, %r13d
+	jl				0f // ret
+	vbroadcastsd	48(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vbroadcastsd	56(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+
+	vbroadcastsd	48(%r12), %ymm13
+	vmulpd			%ymm6, %ymm13, %ymm6
+	cmpl			$8, %r13d
+	jl				0f // ret
+	vbroadcastsd	88(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+
+	vbroadcastsd	56(%r12), %ymm13
+	vmulpd			%ymm7, %ymm13, %ymm7
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_4x8_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
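+//
+// reference note (informal): same forward substitution as the _inv variants,
+// but E has an implicit unit diagonal, so there is no per-column scaling, only
+// the acc[:][kk] -= E[kk][jj] * acc[:][jj] updates for kk > jj.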
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_ONE_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_one_8x4_lib4, @function
+inner_edge_dtrsm_rlt_one_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_one_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_8x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_one_8x4_lib4, .-inner_edge_dtrsm_rlt_one_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10  <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_ONE_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_one_8x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl			$2, %r11d
+	jl				0f // ret
+
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+
+	cmpl			$3, %r11d
+	jl				0f // ret
+
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+
+	cmpl			$4, %r11d
+	jl				0f // ret
+
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_one_8x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
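+//
+// reference semantics (informal sketch): E is 4x4 upper triangular and used
+// transposed, so the substitution runs backwards over the columns:
+//
+//   for(jj=3; jj>=0; jj--) {
+//       acc[:][jj] *= inv_diag_E[jj];
+//       for(kk=jj-1; kk>=0; kk--)
+//           acc[:][kk] -= E[kk][jj] * acc[:][jj];
+//   }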
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUT_INV_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rut_inv_8x4_lib4, @function
+inner_edge_dtrsm_rut_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rut_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_8x4_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vmulpd			%ymm7, %ymm12, %ymm7
+	vbroadcastsd	112(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm2
+	vfnmadd231pd	%ymm7, %ymm12, %ymm6
+	vbroadcastsd	104(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm1
+	vfnmadd231pd	%ymm7, %ymm12, %ymm5
+	vbroadcastsd	96(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm0
+	vfnmadd231pd	%ymm7, %ymm12, %ymm4
+
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vmulpd			%ymm6, %ymm12, %ymm6
+	vbroadcastsd	72(%r10), %ymm12
+	vfnmadd231pd	%ymm2, %ymm12, %ymm1
+	vfnmadd231pd	%ymm6, %ymm12, %ymm5
+	vbroadcastsd	64(%r10), %ymm12
+	vfnmadd231pd	%ymm2, %ymm12, %ymm0
+	vfnmadd231pd	%ymm6, %ymm12, %ymm4
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vmulpd			%ymm5, %ymm12, %ymm5
+	vbroadcastsd	32(%r10), %ymm12
+	vfnmadd231pd	%ymm1, %ymm12, %ymm0
+	vfnmadd231pd	%ymm5, %ymm12, %ymm4
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+	vmulpd			%ymm4, %ymm12, %ymm4
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rut_inv_8x4_lib4, .-inner_edge_dtrsm_rut_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
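+//
+// Variable-size variant of the edge above: kn (r12d) is the number of
+// active columns, so the scale-and-update step for column j is skipped
+// whenever j >= kn (the jle branches below); column 0 is always processed.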
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUT_INV_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rut_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rut_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl			$3, %r12d
+	jle				0f
+
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vmulpd			%ymm7, %ymm12, %ymm7
+	vbroadcastsd	112(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm2
+	vfnmadd231pd	%ymm7, %ymm12, %ymm6
+	vbroadcastsd	104(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm1
+	vfnmadd231pd	%ymm7, %ymm12, %ymm5
+	vbroadcastsd	96(%r10), %ymm12
+	vfnmadd231pd	%ymm3, %ymm12, %ymm0
+	vfnmadd231pd	%ymm7, %ymm12, %ymm4
+
+0:
+	cmpl			$2, %r12d
+	jle				1f
+
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vmulpd			%ymm6, %ymm12, %ymm6
+	vbroadcastsd	72(%r10), %ymm12
+	vfnmadd231pd	%ymm2, %ymm12, %ymm1
+	vfnmadd231pd	%ymm6, %ymm12, %ymm5
+	vbroadcastsd	64(%r10), %ymm12
+	vfnmadd231pd	%ymm2, %ymm12, %ymm0
+	vfnmadd231pd	%ymm6, %ymm12, %ymm4
+
+1:
+	cmpl			$1, %r12d
+	jle				2f
+
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vmulpd			%ymm5, %ymm12, %ymm5
+	vbroadcastsd	32(%r10), %ymm12
+	vfnmadd231pd	%ymm1, %ymm12, %ymm0
+	vfnmadd231pd	%ymm5, %ymm12, %ymm4
+
+2:
+
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+	vmulpd			%ymm4, %ymm12, %ymm4
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rut_inv_8x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
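+//
+// Forward column sweep of D := D * E^{-1} (E upper triangular, reciprocals
+// of its diagonal in inv_diag_E), applied to both 4-row halves of the 8x4
+// accumulator. Equivalent scalar recurrence (sketch):
+//   for j = 0, 1, 2, 3:
+//     for i = 0, ..., j-1:
+//       d[:,j] -= E[i][j] * d[:,i]    // E[i][j] at byte offset 8*(4*j+i)
+//     d[:,j] *= inv_diag_E[j]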
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_run_inv_8x4_lib4, @function
+inner_edge_dtrsm_run_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_run_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_8x4_lib4:
+#endif
+#endif
+
+	// first column
+	vbroadcastsd	0(%r11), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+	vmulpd			%ymm4, %ymm12, %ymm4
+
+	// second column
+	vbroadcastsd	32(%r10), %ymm12
+	vfnmadd231pd	%ymm0, %ymm12, %ymm1
+	vfnmadd231pd	%ymm4, %ymm12, %ymm5
+	vbroadcastsd	8(%r11), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vmulpd			%ymm5, %ymm12, %ymm5
+
+	// third column
+	vbroadcastsd	64(%r10), %ymm12
+	vfnmadd231pd	%ymm0, %ymm12, %ymm2
+	vfnmadd231pd	%ymm4, %ymm12, %ymm6
+	vbroadcastsd	72(%r10), %ymm12
+	vfnmadd231pd	%ymm1, %ymm12, %ymm2
+	vfnmadd231pd	%ymm5, %ymm12, %ymm6
+	vbroadcastsd	16(%r11), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vmulpd			%ymm6, %ymm12, %ymm6
+
+	// fourth column
+	vbroadcastsd	96(%r10), %ymm12
+	vfnmadd231pd	%ymm0, %ymm12, %ymm3
+	vfnmadd231pd	%ymm4, %ymm12, %ymm7
+	vbroadcastsd	104(%r10), %ymm12
+	vfnmadd231pd	%ymm1, %ymm12, %ymm3
+	vfnmadd231pd	%ymm5, %ymm12, %ymm7
+	vbroadcastsd	112(%r10), %ymm12
+	vfnmadd231pd	%ymm2, %ymm12, %ymm3
+	vfnmadd231pd	%ymm6, %ymm12, %ymm7
+	vbroadcastsd	24(%r11), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vmulpd			%ymm7, %ymm12, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_run_inv_8x4_lib4, .-inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10  <- E0
+// r11  <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E0
+// r11  <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
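+//
+// Forward substitution for D := E^{-1} * D with E unit lower triangular,
+// spread over two 4x4 panels (E0 and E1 = E0 + 4*sde*sizeof(double)).
+// For each pivot row k the strictly-lower part of E's column k is loaded
+// (diagonal and above blended to zero), the k-th element of every
+// accumulator column is broadcast with vpermpd, and the rows below k are
+// updated with vfnmadd231pd: pivots 0..3 update both 4-row halves, pivots
+// 4..6 only the lower half against E's bottom-right triangle.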
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lln_one_8x4_lib4, @function
+inner_edge_dtrsm_lln_one_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lln_one_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_8x4_lib4:
+#endif
+#endif
+
+	movq	%r10, %r12 // E1 <- E0
+	addq	%r11, %r12 // E1 <- E0 + 4*sde*sizeof(double)
+
+	// left block-column
+
+	vxorpd			%ymm14, %ymm14, %ymm14
+	vmovapd			0(%r10), %ymm12
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm12
+	vmovapd			0(%r12), %ymm14
+	vpermpd			$0x00, %ymm0, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm0
+	vfnmadd231pd	%ymm14, %ymm13, %ymm4
+	vpermpd			$0x00, %ymm1, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm1
+	vfnmadd231pd	%ymm14, %ymm13, %ymm5
+	vpermpd			$0x00, %ymm2, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm2
+	vfnmadd231pd	%ymm14, %ymm13, %ymm6
+	vpermpd			$0x00, %ymm3, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm3
+	vfnmadd231pd	%ymm14, %ymm13, %ymm7
+
+	vxorpd			%ymm14, %ymm14, %ymm14
+	vmovapd			32(%r10), %ymm12
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm12
+	vmovapd			32(%r12), %ymm14
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm0
+	vfnmadd231pd	%ymm14, %ymm13, %ymm4
+	vpermpd			$0x55, %ymm1, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm1
+	vfnmadd231pd	%ymm14, %ymm13, %ymm5
+	vpermpd			$0x55, %ymm2, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm2
+	vfnmadd231pd	%ymm14, %ymm13, %ymm6
+	vpermpd			$0x55, %ymm3, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm3
+	vfnmadd231pd	%ymm14, %ymm13, %ymm7
+
+	vxorpd			%ymm14, %ymm14, %ymm14
+	vmovapd			64(%r10), %ymm12
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm12
+	vmovapd			64(%r12), %ymm14
+	vpermpd			$0xaa, %ymm0, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm0
+	vfnmadd231pd	%ymm14, %ymm13, %ymm4
+	vpermpd			$0xaa, %ymm1, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm1
+	vfnmadd231pd	%ymm14, %ymm13, %ymm5
+	vpermpd			$0xaa, %ymm2, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm2
+	vfnmadd231pd	%ymm14, %ymm13, %ymm6
+	vpermpd			$0xaa, %ymm3, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm3
+	vfnmadd231pd	%ymm14, %ymm13, %ymm7
+
+	vmovapd			96(%r12), %ymm14
+	vpermpd			$0xff, %ymm0, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm4
+	vpermpd			$0xff, %ymm1, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm5
+	vpermpd			$0xff, %ymm2, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm6
+	vpermpd			$0xff, %ymm3, %ymm13
+	vfnmadd231pd	%ymm14, %ymm13, %ymm7
+
+	addq		$128, %r12
+
+
+	// right block-column
+
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	vmovapd			0(%r12), %ymm12
+	vblendpd		$0x1, %ymm14, %ymm12, %ymm12
+	vpermpd			$0x00, %ymm4, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm4
+	vpermpd			$0x00, %ymm5, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm5
+	vpermpd			$0x00, %ymm6, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm6
+	vpermpd			$0x00, %ymm7, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm7
+
+	vmovapd			32(%r12), %ymm12
+	vblendpd		$0x3, %ymm14, %ymm12, %ymm12
+	vpermpd			$0x55, %ymm4, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm4
+	vpermpd			$0x55, %ymm5, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm5
+	vpermpd			$0x55, %ymm6, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm6
+	vpermpd			$0x55, %ymm7, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm7
+
+	vmovapd			64(%r12), %ymm12
+	vblendpd		$0x7, %ymm14, %ymm12, %ymm12
+	vpermpd			$0xaa, %ymm4, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm4
+	vpermpd			$0xaa, %ymm5, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm5
+	vpermpd			$0xaa, %ymm6, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm6
+	vpermpd			$0xaa, %ymm7, %ymm13
+	vfnmadd231pd	%ymm12, %ymm13, %ymm7
+
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lln_one_8x4_lib4, .-inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
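+//
+// Backward substitution for D := E^{-1} * D with E upper triangular across
+// two 4x4 panels (E0 and E1 = E0 + 4*sde*sizeof(double)), reciprocals of
+// its diagonal in inv_diag_E. Rows are eliminated from 7 down to 0: the
+// current row of each accumulator column is broadcast, scaled by the
+// matching reciprocal, blended back into place, and then subtracted (times
+// the part of E's column above the diagonal) from the rows above it in
+// both panel halves.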
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LUN_INV_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lun_inv_8x4_lib4, @function
+inner_edge_dtrsm_lun_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lun_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_8x4_lib4:
+#endif
+#endif
+	
+	movq	%r10, %r13 // E1 <- E0
+	addq	%r11, %r13 // E1 <- E0 + 4*sde*sizeof(double)
+
+	// bottom-right
+
+	vmovapd			224(%r13), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	56(%r12), %ymm12
+	vmovapd			224(%r10), %ymm11
+
+	vpermpd			$0xff, %ymm4, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm4, %ymm4
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vfnmadd231pd	%ymm11, %ymm14, %ymm0
+
+	vpermpd			$0xff, %ymm5, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm5, %ymm5
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vfnmadd231pd	%ymm11, %ymm14, %ymm1
+
+	vpermpd			$0xff, %ymm6, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm6, %ymm6
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vfnmadd231pd	%ymm11, %ymm14, %ymm2
+
+	vpermpd			$0xff, %ymm7, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm7, %ymm7
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vfnmadd231pd	%ymm11, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0 (redundant: the VEX vmovapd to %xmm13 below already zeroes the upper lanes)
+	vmovapd			192(%r13), %xmm13
+	vbroadcastsd	48(%r12), %ymm12
+	vmovapd			192(%r10), %ymm11
+
+	vpermpd			$0xaa, %ymm4, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm4, %ymm4
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vfnmadd231pd	%ymm11, %ymm14, %ymm0
+
+	vpermpd			$0xaa, %ymm5, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm5, %ymm5
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vfnmadd231pd	%ymm11, %ymm14, %ymm1
+
+	vpermpd			$0xaa, %ymm6, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm6, %ymm6
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vfnmadd231pd	%ymm11, %ymm14, %ymm2
+
+	vpermpd			$0xaa, %ymm7, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm7, %ymm7
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vfnmadd231pd	%ymm11, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			160(%r13), %xmm13
+	vbroadcastsd	40(%r12), %ymm12
+	vmovapd			160(%r10), %ymm11
+
+	vpermpd			$0x55, %ymm4, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm4, %ymm4
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vfnmadd231pd	%ymm11, %ymm14, %ymm0
+
+	vpermpd			$0x55, %ymm5, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm5, %ymm5
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vfnmadd231pd	%ymm11, %ymm14, %ymm1
+
+	vpermpd			$0x55, %ymm6, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm6, %ymm6
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vfnmadd231pd	%ymm11, %ymm14, %ymm2
+
+	vpermpd			$0x55, %ymm7, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm7, %ymm7
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vfnmadd231pd	%ymm11, %ymm14, %ymm3
+
+
+	vbroadcastsd	32(%r12), %ymm12
+	vmovapd			128(%r10), %ymm11
+
+	vpermpd			$0x00, %ymm4, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm4, %ymm4
+	vfnmadd231pd	%ymm11, %ymm14, %ymm0
+
+	vpermpd			$0x00, %ymm5, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm5, %ymm5
+	vfnmadd231pd	%ymm11, %ymm14, %ymm1
+
+	vpermpd			$0x00, %ymm6, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm6, %ymm6
+	vfnmadd231pd	%ymm11, %ymm14, %ymm2
+
+	vpermpd			$0x00, %ymm7, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm7, %ymm7
+	vfnmadd231pd	%ymm11, %ymm14, %ymm3
+
+
+	// top-left
+
+	vmovapd			96(%r10), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	24(%r12), %ymm12
+
+	vpermpd			$0xff, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vpermpd			$0xff, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vpermpd			$0xff, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vpermpd			$0xff, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovapd			64(%r10), %xmm13
+	vbroadcastsd	16(%r12), %ymm12
+
+	vpermpd			$0xaa, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vpermpd			$0xaa, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vpermpd			$0xaa, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vpermpd			$0xaa, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			32(%r10), %xmm13
+	vbroadcastsd	8(%r12), %ymm12
+
+	vpermilpd		$0xf, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vpermilpd		$0xf, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vpermilpd		$0xf, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vpermilpd		$0xf, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+
+	vbroadcastsd	0(%r12), %ymm12
+
+	vmulpd			%ymm0, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm0, %ymm0
+
+	vmulpd			%ymm1, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm1, %ymm1
+
+	vmulpd			%ymm2, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm2, %ymm2
+
+	vmulpd			%ymm3, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lun_inv_8x4_lib4, .-inner_edge_dtrsm_lun_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// r13  <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// r13  <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
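+//
+// Variable-size variant of the edge above: km (r13d) is the number of
+// active rows, so the elimination steps for rows 7, 6 and 5 are skipped
+// whenever the row index is >= km; rows 4..0 are always processed.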
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_LUN_INV_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_lun_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_lun_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#endif
+#endif
+	
+	movq	%r10, %r14 // E1 <- E0
+	addq	%r11, %r14 // E1 <- E0 + 4*sde*sizeof(double)
+
+	// bottom-right
+
+	cmpl	$7, %r13d
+	jle		0f
+
+	vmovapd			224(%r14), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	56(%r12), %ymm12
+	vmovapd			224(%r10), %ymm11
+
+	vpermpd			$0xff, %ymm4, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm4, %ymm4
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vfnmadd231pd	%ymm11, %ymm14, %ymm0
+
+	vpermpd			$0xff, %ymm5, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm5, %ymm5
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vfnmadd231pd	%ymm11, %ymm14, %ymm1
+
+	vpermpd			$0xff, %ymm6, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm6, %ymm6
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vfnmadd231pd	%ymm11, %ymm14, %ymm2
+
+	vpermpd			$0xff, %ymm7, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm7, %ymm7
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vfnmadd231pd	%ymm11, %ymm14, %ymm3
+
+0:
+	cmpl	$6, %r13d
+	jle		1f
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0 (redundant: the VEX vmovapd to %xmm13 below already zeroes the upper lanes)
+	vmovapd			192(%r14), %xmm13
+	vbroadcastsd	48(%r12), %ymm12
+	vmovapd			192(%r10), %ymm11
+
+	vpermpd			$0xaa, %ymm4, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm4, %ymm4
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vfnmadd231pd	%ymm11, %ymm14, %ymm0
+
+	vpermpd			$0xaa, %ymm5, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm5, %ymm5
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vfnmadd231pd	%ymm11, %ymm14, %ymm1
+
+	vpermpd			$0xaa, %ymm6, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm6, %ymm6
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vfnmadd231pd	%ymm11, %ymm14, %ymm2
+
+	vpermpd			$0xaa, %ymm7, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm7, %ymm7
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vfnmadd231pd	%ymm11, %ymm14, %ymm3
+
+1:
+	cmpl	$5, %r13d
+	jle		2f
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			160(%r14), %xmm13
+	vbroadcastsd	40(%r12), %ymm12
+	vmovapd			160(%r10), %ymm11
+
+	vpermpd			$0x55, %ymm4, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm4, %ymm4
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vfnmadd231pd	%ymm11, %ymm14, %ymm0
+
+	vpermpd			$0x55, %ymm5, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm5, %ymm5
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vfnmadd231pd	%ymm11, %ymm14, %ymm1
+
+	vpermpd			$0x55, %ymm6, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm6, %ymm6
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vfnmadd231pd	%ymm11, %ymm14, %ymm2
+
+	vpermpd			$0x55, %ymm7, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm7, %ymm7
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vfnmadd231pd	%ymm11, %ymm14, %ymm3
+
+2:
+
+	vbroadcastsd	32(%r12), %ymm12
+	vmovapd			128(%r10), %ymm11
+
+	vpermpd			$0x00, %ymm4, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm4, %ymm4
+	vfnmadd231pd	%ymm11, %ymm14, %ymm0
+
+	vpermpd			$0x00, %ymm5, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm5, %ymm5
+	vfnmadd231pd	%ymm11, %ymm14, %ymm1
+
+	vpermpd			$0x00, %ymm6, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm6, %ymm6
+	vfnmadd231pd	%ymm11, %ymm14, %ymm2
+
+	vpermpd			$0x00, %ymm7, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm7, %ymm7
+	vfnmadd231pd	%ymm11, %ymm14, %ymm3
+
+
+	// top-left
+
+	vmovapd			96(%r10), %ymm13
+	vxorpd			%ymm14, %ymm14, %ymm14 // 0.0
+	vblendpd		$0x7, %ymm13, %ymm14, %ymm13
+	vbroadcastsd	24(%r12), %ymm12
+
+	vpermpd			$0xff, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vpermpd			$0xff, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vpermpd			$0xff, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vpermpd			$0xff, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x8, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovapd			64(%r10), %xmm13
+	vbroadcastsd	16(%r12), %ymm12
+
+	vpermpd			$0xaa, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vpermpd			$0xaa, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vpermpd			$0xaa, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vpermpd			$0xaa, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x4, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0
+	vmovsd			32(%r10), %xmm13
+	vbroadcastsd	8(%r12), %ymm12
+
+	vpermilpd		$0xf, %ymm0, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm0, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm0
+
+	vpermilpd		$0xf, %ymm1, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm1, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm1
+
+	vpermilpd		$0xf, %ymm2, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm2, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm2
+
+	vpermilpd		$0xf, %ymm3, %ymm14
+	vmulpd			%ymm14, %ymm12, %ymm14
+	vblendpd		$0x2, %ymm14, %ymm3, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm3
+
+
+	vbroadcastsd	0(%r12), %ymm12
+
+	vmulpd			%ymm0, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm0, %ymm0
+
+	vmulpd			%ymm1, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm1, %ymm1
+
+	vmulpd			%ymm2, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm2, %ymm2
+
+	vmulpd			%ymm3, %ymm12, %ymm14
+	vblendpd		$0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_lun_inv_8x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// left kernel
+//
+// input arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
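+//
+// Factorizes the 8x4 block held in ymm0..ymm7 (rows 0..3 / 4..7) without
+// pivoting. For each column j = 0..3: the previously factorized columns
+// are eliminated from it, the reciprocal of the pivot is computed with
+// vdivsd and stored to inv_diag_E[j], the subdiagonal part of the column
+// is scaled by that reciprocal in both 4-row halves, and the finished U
+// entries are blended back in on and above the diagonal.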
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgetrf_l_8x4_lib4, @function
+inner_edge_dgetrf_l_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgetrf_l_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_l_8x4_lib4:
+#endif
+#endif
+	
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd			LC04(%rip), %xmm14 // 1.0
+#endif
+//	vmovddup		%xmm14, %xmm14
+
+	// first column
+//	vblendpd		$0x1, %ymm0, %ymm12, %ymm12
+	vmovapd			%ymm0, %ymm12
+	vdivsd			%xmm0, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 0(%r10)
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vblendpd		$0x1, %ymm12, %ymm0, %ymm0
+
+	// second column
+	vpermpd			$0x00, %ymm1, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vblendpd		$0x2, %ymm1, %ymm13, %ymm12
+
+	vpermilpd		$0x3, %xmm1, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 8(%r10)
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vblendpd		$0x3, %ymm12, %ymm1, %ymm1
+
+	// third column
+	vpermpd			$0x00, %ymm2, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vblendpd		$0x2, %ymm2, %ymm13, %ymm12
+
+	vpermpd			$0x55, %ymm2, %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vblendpd		$0x4, %ymm2, %ymm12, %ymm12
+
+	vpermpd			$0xaa, %ymm2, %ymm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 16(%r10)
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vblendpd		$0x7, %ymm12, %ymm2, %ymm2
+
+	// fourth column
+	vpermpd			$0x00, %ymm3, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vblendpd		$0x2, %ymm3, %ymm13, %ymm12
+
+	vpermpd			$0x55, %ymm3, %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vblendpd		$0x4, %ymm3, %ymm12, %ymm12
+
+	vpermpd			$0xaa, %ymm3, %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+	vblendpd		$0x8, %ymm3, %ymm12, %ymm12
+	
+	vpermpd			$0xff, %ymm3, %ymm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmovsd			%xmm13, 24(%r10)
+//	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+	vblendpd		$0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgetrf_l_8x4_lib4, .-inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_lib4, @function
+inner_store_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_lib4:
+#endif
+#endif
+	
+	movq	%r10, %r15 // D1 <- D0
+	addq	%r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+	vmovapd %ymm0,  0(%r10)
+	vmovapd %ymm1, 32(%r10)
+	vmovapd %ymm2, 64(%r10)
+	vmovapd %ymm3, 96(%r10)
+
+	vmovapd %ymm4,  0(%r15)
+	vmovapd %ymm5, 32(%r15)
+	vmovapd %ymm6, 64(%r15)
+	vmovapd %ymm7, 96(%r15)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x8_lib4, @function
+inner_store_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x8_lib4; .scl 2; .type 32; .endef
+inner_store_4x8_lib4:
+#endif
+#endif
+	
+	vmovapd %ymm0,   0(%r10)
+	vmovapd %ymm1,  32(%r10)
+	vmovapd %ymm2,  64(%r10)
+	vmovapd %ymm3,  96(%r10)
+
+	vmovapd %ymm4, 128(%r10)
+	vmovapd %ymm5, 160(%r10)
+	vmovapd %ymm6, 192(%r10)
+	vmovapd %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_lib4, .-inner_store_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
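+//
+// Variable-size store: km (r12d) is turned into a per-lane vmaskmovpd mask
+// (via the constant vector .LC03, defined elsewhere in this file) so that
+// only the first km-4 rows of the lower 4-row panel are written, while the
+// upper panel is always stored in full; kn (r13d) limits how many of the 4
+// columns are written.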
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_vs_lib4, @function
+inner_store_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_vs_lib4:
+#endif
+#endif
+	
+	movq	%r10, %r15 // D1 <- D0
+	addq	%r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+	vcvtsi2sd	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC03(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	cmpl		$2, %r13d
+	vmovapd		%ymm0, 0(%r10)
+	vmaskmovpd	%ymm4, %ymm15,  0(%r15)
+	jl			0f // end
+	cmpl		$3, %r13d
+	vmovapd		%ymm1, 32(%r10)
+	vmaskmovpd	%ymm5, %ymm15, 32(%r15)
+	jl			0f // end
+	vmovapd		%ymm2, 64(%r10)
+	vmaskmovpd	%ymm6, %ymm15, 64(%r15)
+	je			0f // end
+	vmovapd		%ymm3, 96(%r10)
+	vmaskmovpd	%ymm7, %ymm15, 96(%r15)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_vs_lib4, .-inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10   <- D
+// r11d  <- km (number of rows to store)
+// r12d  <- kn (number of columns to store)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10   <- D
+// r11d  <- km (number of rows to store)
+// r12d  <- kn (number of columns to store)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
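+//
+// Variable-size store of the transposed 4x8 result: the row count in r11d
+// is turned into a vmaskmovpd lane mask (via the constant vector .LC02,
+// defined elsewhere in this file) limiting how many of the 4 rows are
+// written, while r12d limits the columns: columns 0..4 are always stored,
+// and columns 5, 6 and 7 only when r12d exceeds 5, 6 and 7 respectively.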
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X8_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x8_vs_lib4, @function
+inner_store_4x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x8_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x8_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	vmaskmovpd	%ymm0, %ymm15,   0(%r10)
+	vmaskmovpd	%ymm1, %ymm15,  32(%r10)
+	vmaskmovpd	%ymm2, %ymm15,  64(%r10)
+	vmaskmovpd	%ymm3, %ymm15,  96(%r10)
+
+	vmaskmovpd	%ymm4, %ymm15, 128(%r10)
+	cmpl		$6, %r12d
+	jl			0f // end
+	vmaskmovpd	%ymm5, %ymm15, 160(%r10)
+	cmpl		$7, %r12d
+	jl			0f // end
+	vmaskmovpd	%ymm6, %ymm15, 192(%r10)
+	je			0f // end
+	vmaskmovpd	%ymm7, %ymm15, 224(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_vs_lib4, .-inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_lib4, @function
+inner_store_l_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_lib4:
+#endif
+#endif
+	
+	movq	%r10, %r15 // D1 <- D0
+	addq	%r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+	vmovapd		%ymm0, 0(%r10)
+	vmovapd		32(%r10), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm1, %ymm1	
+	vmovapd		%ymm1, 32(%r10)
+	vmovapd		64(%r10), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm2, %ymm2	
+	vmovapd		%ymm2, 64(%r10)
+	vmovapd		96(%r10), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm3, %ymm3	
+	vmovapd		%ymm3, 96(%r10)
+
+	vmovapd		%ymm4, 0(%r15)
+	vmovapd		%ymm5, 32(%r15)
+	vmovapd		%ymm6, 64(%r15)
+	vmovapd		%ymm7, 96(%r15)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_lib4, .-inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r15  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
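+//
+// Variable-size lower-triangular store: km (r12d) builds the vmaskmovpd row
+// mask for the lower 4-row panel and kn (r13d) limits the columns, as in
+// the full 8x4 vs store; in the upper panel each column of D is reloaded
+// and its elements above the diagonal are blended back over the result
+// before storing, so they are left unchanged.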
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_vs_lib4, @function
+inner_store_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC03(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	vmovapd		%ymm0, 0(%r10)
+	vmaskmovpd	%ymm4, %ymm15,  0(%r10, %r11, 1)
+	cmpl		$2, %r13d
+	jl			0f // end
+	vmovapd		32(%r10), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm1, %ymm1	
+	vmovapd		%ymm1, 32(%r10)
+	vmaskmovpd	%ymm5, %ymm15, 32(%r10, %r11, 1)
+	cmpl		$3, %r13d
+	jl			0f // end
+	vmovapd		64(%r10), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm2, %ymm2	
+	vmovapd		%ymm2, 64(%r10)
+	vmaskmovpd	%ymm6, %ymm15, 64(%r10, %r11, 1)
+	je			0f // end
+	vmovapd		96(%r10), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm3, %ymm3	
+	vmovapd		%ymm3, 96(%r10)
+	vmaskmovpd	%ymm7, %ymm15, 96(%r10, %r11, 1)
+
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_vs_lib4, .-inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// rbp  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// rbp  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
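+//
+// Generalized store: row masks derived from m0 and m1 (via the constant
+// vectors .LC02/.LC03, defined elsewhere in this file) restrict the stores
+// to rows m0 <= i < m1; n0 shifts the accumulator columns and the D pointer
+// so writing starts at column n0, and at most min(n1,4)-n0 columns are
+// written; a non-zero offset rotates each column (and the masks) across
+// the panel boundary, spilling into a third 4-row panel at D + 8*sdd
+// doubles when needed.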
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_gen_lib4, @function
+inner_store_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_gen_lib4:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2sd	%r13d, %xmm14, %xmm14
+	vcvtsi2sd	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm12
+	vmovupd		.LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm12
+	vmovupd		LC03(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm12, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm13, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm6, %ymm5
+	vmovapd		%ymm3, %ymm2
+	vmovapd		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	vmaskmovpd	%ymm0, %ymm14,  0(%r11)
+	vmaskmovpd	%ymm4, %ymm15,  0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			4f // end
+	vmaskmovpd	%ymm1, %ymm14, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm15, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			4f // end
+	vmaskmovpd	%ymm2, %ymm14, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm15, 64(%r11, %r12, 1)
+	je			4f // end
+	vmaskmovpd	%ymm3, %ymm14, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm15, 96(%r11, %r12, 1)
+
+	jmp		4f
+
+0:
+	
+	cmpl	$1, %r10d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x03, %ymm4, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm0, %ymm12, %ymm0
+	vperm2f128	$0x03, %ymm13, %ymm4, %ymm12
+	vshufpd		$0x5, %ymm4, %ymm12, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x03, %ymm5, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm1, %ymm12, %ymm1
+	vperm2f128	$0x03, %ymm13, %ymm5, %ymm12
+	vshufpd		$0x5, %ymm5, %ymm12, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x03, %ymm6, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm2, %ymm12, %ymm2
+	vperm2f128	$0x03, %ymm13, %ymm6, %ymm12
+	vshufpd		$0x5, %ymm6, %ymm12, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x03, %ymm7, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm3, %ymm12, %ymm3
+	vperm2f128	$0x03, %ymm13, %ymm7, %ymm12
+	vshufpd		$0x5, %ymm7, %ymm12, %ymm7
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm15, %ymm12, %ymm15
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm12
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC08(%rip), %ymm12
+	vmovupd		.LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC08(%rip), %ymm12
+	vmovupd		LC05(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x1, %ymm14, %ymm15, %ymm14
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r10d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x03, %ymm4, %ymm0, %ymm0
+	vperm2f128	$0x03, %ymm13, %ymm4, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x03, %ymm5, %ymm1, %ymm1
+	vperm2f128	$0x03, %ymm13, %ymm5, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x03, %ymm6, %ymm2, %ymm2
+	vperm2f128	$0x03, %ymm13, %ymm6, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x03, %ymm7, %ymm3, %ymm3
+	vperm2f128	$0x03, %ymm13, %ymm7, %ymm7
+
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm14
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC09(%rip), %ymm12
+	vmovupd		.LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC09(%rip), %ymm12
+	vmovupd		LC06(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x3, %ymm14, %ymm15, %ymm14
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x21, %ymm0, %ymm4, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm4, %ymm0
+	vperm2f128	$0x21, %ymm4, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x21, %ymm1, %ymm5, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm5, %ymm1
+	vperm2f128	$0x21, %ymm5, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x21, %ymm2, %ymm6, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm6, %ymm2
+	vperm2f128	$0x21, %ymm6, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x21, %ymm3, %ymm7, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm7, %ymm3
+	vperm2f128	$0x21, %ymm7, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm7
+
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm14
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC10(%rip), %ymm12
+	vmovupd		.LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC10(%rip), %ymm12
+	vmovupd		LC07(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x7, %ymm14, %ymm15, %ymm14
+
+3:
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm4, %ymm14, 0(%r11, %r12, 1)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 2)
+	cmpl		$2, %r15d
+	jl			4f // end
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm14, 32(%r11, %r12, 1)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 2)
+	cmpl		$3, %r15d
+	jl			4f // end
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm14, 64(%r11, %r12, 1)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 2)
+	je			4f // end
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm14, 96(%r11, %r12, 1)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 2)
+
+4:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_gen_lib4, .-inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// rbp  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// rbp  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
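+//
+// Lower-triangular version of the generalized store above: same row masks,
+// column shift and offset handling, but as the columns advance successive
+// mask lanes are switched off (by blending in the non-negative constant
+// .LC04, defined elsewhere in this file) so that the elements of D above
+// the diagonal are never overwritten.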
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_gen_lib4, @function
+inner_store_l_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_gen_lib4:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2sd	%r13d, %xmm14, %xmm14
+	vcvtsi2sd	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm12
+	vmovupd		.LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm12
+	vmovupd		LC03(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm12, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm13, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm6, %ymm5
+	vmovapd		%ymm3, %ymm2
+	vmovapd		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm13
+#endif
+
+	vmaskmovpd	%ymm0, %ymm14,  0(%r11)
+	vmaskmovpd	%ymm4, %ymm15,  0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x1, %ymm13, %ymm14, %ymm14
+	vmaskmovpd	%ymm1, %ymm14, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm15, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x2, %ymm13, %ymm14, %ymm14
+	vmaskmovpd	%ymm2, %ymm14, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm15, 64(%r11, %r12, 1)
+	je			3f // end
+	vblendpd	$0x4, %ymm13, %ymm14, %ymm14
+	vmaskmovpd	%ymm3, %ymm14, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm15, 96(%r11, %r12, 1)
+
+	jmp		3f
+
+0:
+	
+	cmpl	$1, %r10d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x03, %ymm4, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm0, %ymm12, %ymm0
+	vperm2f128	$0x03, %ymm13, %ymm4, %ymm12
+	vshufpd		$0x5, %ymm4, %ymm12, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x03, %ymm5, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm1, %ymm12, %ymm1
+	vperm2f128	$0x03, %ymm13, %ymm5, %ymm12
+	vshufpd		$0x5, %ymm5, %ymm12, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x03, %ymm6, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm2, %ymm12, %ymm2
+	vperm2f128	$0x03, %ymm13, %ymm6, %ymm12
+	vshufpd		$0x5, %ymm6, %ymm12, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x03, %ymm7, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm3, %ymm12, %ymm3
+	vperm2f128	$0x03, %ymm13, %ymm7, %ymm12
+	vshufpd		$0x5, %ymm7, %ymm12, %ymm7
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm15, %ymm12, %ymm15
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm12
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC08(%rip), %ymm12
+	vmovupd		.LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC08(%rip), %ymm12
+	vmovupd		LC05(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x1, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm15
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm4, %ymm14, 0(%r11, %r12, 1)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 2)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x2, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm14, 32(%r11, %r12, 1)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 2)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x4, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm14, 64(%r11, %r12, 1)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 2)
+	je			3f // end
+	vblendpd	$0x8, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm14, 96(%r11, %r12, 1)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 2)
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r10d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x03, %ymm4, %ymm0, %ymm0
+	vperm2f128	$0x03, %ymm13, %ymm4, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x03, %ymm5, %ymm1, %ymm1
+	vperm2f128	$0x03, %ymm13, %ymm5, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x03, %ymm6, %ymm2, %ymm2
+	vperm2f128	$0x03, %ymm13, %ymm6, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x03, %ymm7, %ymm3, %ymm3
+	vperm2f128	$0x03, %ymm13, %ymm7, %ymm7
+
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm14
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC09(%rip), %ymm12
+	vmovupd		.LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC09(%rip), %ymm12
+	vmovupd		LC06(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x3, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm15
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm4, %ymm14, 0(%r11, %r12, 1)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 2)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x4, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm14, 32(%r11, %r12, 1)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 2)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x8, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm14, 64(%r11, %r12, 1)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 2)
+	je			3f // end
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm14, 96(%r11, %r12, 1)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 2)
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x21, %ymm0, %ymm4, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm4, %ymm0
+	vperm2f128	$0x21, %ymm4, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x21, %ymm1, %ymm5, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm5, %ymm1
+	vperm2f128	$0x21, %ymm5, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x21, %ymm2, %ymm6, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm6, %ymm2
+	vperm2f128	$0x21, %ymm6, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x21, %ymm3, %ymm7, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm7, %ymm3
+	vperm2f128	$0x21, %ymm7, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm7
+
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm14
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC10(%rip), %ymm12
+	vmovupd		.LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC10(%rip), %ymm12
+	vmovupd		LC07(%rip), %ymm13
+#endif
+	vandpd		%ymm12, %ymm14, %ymm12
+	vandpd		%ymm13, %ymm15, %ymm13
+
+	vblendpd	$0x7, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+	vmovapd		LC04(%rip), %ymm15
+#endif
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm4, %ymm14, 0(%r11, %r12, 1)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 2)
+	cmpl		$2, %r15d
+	jl			3f // end
+	vblendpd	$0x8, %ymm15, %ymm12, %ymm12
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm14, 32(%r11, %r12, 1)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 2)
+	cmpl		$3, %r15d
+	jl			3f // end
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm14, 64(%r11, %r12, 1)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 2)
+	je			3f // end
+	vblendpd	$0x2, %ymm15, %ymm14, %ymm14
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm14, 96(%r11, %r12, 1)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 2)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_gen_lib4, .-inner_store_l_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+//                               rdi    rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32
+// void kernel_dgemm_nt_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
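+//
+// Computes the 8x4 block D = alpha * A * B^T + beta * C in the panel-major
+// (lib4) storage format; A, C and D span two 4-row panels, with the second
+// panel 4*sda (resp. 4*sdc, 4*sdd) doubles after the first, and B is a
+// single 4-row panel. Reference semantics (plain C sketch of the lib4
+// indexing):
+//
+//   for(int j=0; j<4; j++)
+//     for(int i=0; i<8; i++) {
+//       double tmp = 0.0;
+//       for(int l=0; l<k; l++)
+//         tmp += A[(i/4)*4*sda + 4*l + i%4] * B[4*l + j];
+//       D[(i/4)*4*sdd + 4*j + i%4] =
+//           alpha[0]*tmp + beta[0]*C[(i/4)*4*sdc + 4*j + i%4];
+//     }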
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_8x4_lib4
+	.type kernel_dgemm_nt_8x4_lib4, @function
+kernel_dgemm_nt_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_8x4_lib4
+_kernel_dgemm_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_8x4_lib4
+	.def kernel_dgemm_nt_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_8x4_lib4, .-kernel_dgemm_nt_8x4_lib4
+#endif
+
+
+
+
+
+//                               1      2              3          4          5        6             7          8
+// void kernel_dgemm_nt_4x8_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D);
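+//
+// Note (inferred from the code below, not in the original source): the 4x8 variant feeds
+// B (with stride sdb) and A into the same 8x4 inner kernel with the operands swapped and
+// then transposes the accumulators while scaling (inner_tran_scale_ab_4x8_lib4), so the
+// result is stored as a 4-row by 8-column block of D.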
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x8_lib4
+	.type kernel_dgemm_nt_4x8_lib4, @function
+kernel_dgemm_nt_4x8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x8_lib4
+_kernel_dgemm_nt_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x8_lib4
+	.def kernel_dgemm_nt_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // B
+	movq	ARG5, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x8_lib4, .-kernel_dgemm_nt_4x8_lib4
+#endif
+
+
+
+
+
+//                                  rdi     rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32   rsp+40  rsp+48
+// void kernel_dgemm_nt_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
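+//
+// Note (inferred, not in the original source): the trailing km/kn arguments select how
+// many of the 8 rows and 4 columns are actually written by the masked store
+// (inner_store_8x4_vs_lib4); a typical edge call might look like
+//   kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, A, sda, B, &beta, C, sdc, D, sdd, m-i, n-j);
+// where m-i and n-j are the remaining rows and columns (illustrative names only).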
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_8x4_vs_lib4
+	.type kernel_dgemm_nt_8x4_vs_lib4, @function
+kernel_dgemm_nt_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_8x4_vs_lib4
+_kernel_dgemm_nt_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_8x4_vs_lib4
+	.def kernel_dgemm_nt_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // store address D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_8x4_vs_lib4, .-kernel_dgemm_nt_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                  1      2              3          4          5        6             7          8          9       10
+// void kernel_dgemm_nt_4x8_vs_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x8_vs_lib4
+	.type kernel_dgemm_nt_4x8_vs_lib4, @function
+kernel_dgemm_nt_4x8_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x8_vs_lib4
+_kernel_dgemm_nt_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x8_vs_lib4
+	.def kernel_dgemm_nt_4x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x8_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // B
+	movq	ARG5, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // km
+	movq	ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x8_vs_lib4, .-kernel_dgemm_nt_4x8_vs_lib4
+#endif
+
+
+
+
+
+//                                   rdi    rsi            rdx        rcx      r8         r9            rsp+8        rsp+16     rsp+24   rsp+32       rsp+40     rsp+48   rsp+56  rsp+64  rsp+72  rsp+80
+// void kernel_dgemm_nt_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
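+//
+// Note (inferred, not in the original source): the 'gen' variant additionally takes
+// offsetC/offsetD (offsets inside the first panel) and the index window m0,m1,n0,n1,
+// so the scale and store routines (inner_scale_ab_8x4_gen, inner_store_8x4_gen) can
+// read and write an arbitrarily placed sub-block instead of an aligned full 8x4 block.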
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_8x4_gen_lib4
+	.type kernel_dgemm_nt_8x4_gen_lib4, @function
+kernel_dgemm_nt_8x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_8x4_gen_lib4
+_kernel_dgemm_nt_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_8x4_gen_lib4
+	.def kernel_dgemm_nt_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // offsetC
+	movq	ARG8, %r13 // C
+	movq	ARG9, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG10, %r10 // offsetD
+	movq	ARG11, %r11 // D
+	movq	ARG12, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG13, %r13 // m0
+	movq	ARG14, %r14 // m1
+	movq	ARG15, %r15 // n0
+	movq	ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_8x4_gen_lib4, .-kernel_dgemm_nt_8x4_gen_lib4
+#endif
+
+
+
+
+
+//                               rdi    rsi            rdx        rcx      r8           r9         rsp+8    rsp+16        rsp+24     rsp+32   rsp+40     rsp+48
+// void kernel_dgemm_nn_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
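+//
+// Usage sketch (not in the original source; inferred from the inner routines below):
+// the 'nn' variant computes one 8x4 block of D = beta*C + alpha * A * B, with B swept
+// across panels (stride sdb); an edge routine first consumes the offsetB misalignment
+// before the main nn loop, e.g.
+//   kernel_dgemm_nn_8x4_lib4(k, &alpha, A, sda, offsetB, B, sdb, &beta, C, sdc, D, sdd);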
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_8x4_lib4
+	.type kernel_dgemm_nn_8x4_lib4, @function
+kernel_dgemm_nn_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_8x4_lib4
+_kernel_dgemm_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_8x4_lib4
+	.def kernel_dgemm_nn_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12 // C
+	movq	ARG10, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_8x4_lib4, .-kernel_dgemm_nn_8x4_lib4
+#endif
+
+
+
+
+
+//                               rdi    rsi            rdx        rcx          r8         r9       rsp+8         rsp+16     rsp+24
+// void kernel_dgemm_nn_4x8_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_4x8_lib4
+	.type kernel_dgemm_nn_4x8_lib4, @function
+kernel_dgemm_nn_4x8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_4x8_lib4
+_kernel_dgemm_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_4x8_lib4
+	.def kernel_dgemm_nn_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_4x8_lib4, .-kernel_dgemm_nn_4x8_lib4
+#endif
+
+
+
+
+
+//                                   rdi    rsi            rdx        rcx      r8        r9         rsp+8    rsp+16        rsp+24    rsp+32     rsp+40   rsp+48    rsp+56     rsp+64   rsp+72  rsp+80  rsp+88  rsp+96
+// void kernel_dgemm_nn_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_8x4_gen_lib4
+	.type kernel_dgemm_nn_8x4_gen_lib4, @function
+kernel_dgemm_nn_8x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_8x4_gen_lib4
+_kernel_dgemm_nn_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_8x4_gen_lib4
+	.def kernel_dgemm_nn_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_8x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13  // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12 // offsetC
+	movq	ARG10, %r13 // C
+	movq	ARG11, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG12, %r10 // offsetD
+	movq	ARG13, %r11 // D
+	movq	ARG14, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG15, %r13 // m0
+	movq	ARG16, %r14 // m1
+	movq	ARG17, %r15 // n0
+	movq	ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_8x4_gen_lib4, .-kernel_dgemm_nn_8x4_gen_lib4
+#endif
+
+
+
+
+
+//                                 rdi     rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32
+// void kernel_dsyrk_nt_l_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
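+//
+// Usage sketch (not in the original source; inferred from the code below): same
+// accumulation as the nt gemm kernel, but only the lower-triangular part of the 8x4
+// block of D = beta*C + alpha * A * B^T is written (inner_store_l_8x4_lib4), as needed
+// for a symmetric rank-k update, e.g.
+//   kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, A, sda, B, &beta, C, sdc, D, sdd);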
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_8x4_lib4
+	.type kernel_dsyrk_nt_l_8x4_lib4, @function
+kernel_dsyrk_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_8x4_lib4
+_kernel_dsyrk_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_8x4_lib4
+	.def kernel_dsyrk_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_8x4_lib4, .-kernel_dsyrk_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+//                                    rdi     rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32   rsp+40  rsp+48
+// void kernel_dsyrk_nt_l_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_8x4_vs_lib4
+	.type kernel_dsyrk_nt_l_8x4_vs_lib4, @function
+kernel_dsyrk_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_8x4_vs_lib4
+_kernel_dsyrk_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_8x4_vs_lib4
+	.def kernel_dsyrk_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // store address D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_8x4_vs_lib4, .-kernel_dsyrk_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                     rdi    rsi            rdx        rcx      r8         r9            rsp+8        rsp+16     rsp+24   rsp+32       rsp+40     rsp+48   rsp+56  rsp+64  rsp+72  rsp+80
+// void kernel_dsyrk_nt_l_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_8x4_gen_lib4
+	.type kernel_dsyrk_nt_l_8x4_gen_lib4, @function
+kernel_dsyrk_nt_l_8x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_8x4_gen_lib4
+_kernel_dsyrk_nt_l_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_8x4_gen_lib4
+	.def kernel_dsyrk_nt_l_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // offsetC
+	movq	ARG8, %r13 // C
+	movq	ARG9, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG10, %r10 // offsetD
+	movq	ARG11, %r11 // D
+	movq	ARG12, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG13, %r13 // m0
+	movq	ARG14, %r14 // m1
+	movq	ARG15, %r15 // n0
+	movq	ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_8x4_gen_lib4, .-kernel_dsyrk_nt_l_8x4_gen_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx      r8           r9         rsp+8    rsp+16     rsp+24
+// void kernel_dtrmm_nn_rl_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd);
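+//
+// Note (inferred, not in the original source): there is no C/beta term here; the kernel
+// computes D = alpha * A * B with a triangular right factor B ('rl' = right, lower).
+// A dedicated edge routine handles the triangular head of B, the regular nn loop the
+// rest, and inner_scale_a0_8x4_lib4 applies alpha only.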
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nn_rl_8x4_lib4
+	.type kernel_dtrmm_nn_rl_8x4_lib4, @function
+kernel_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nn_rl_8x4_lib4
+_kernel_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nn_rl_8x4_lib4
+	.def kernel_dtrmm_nn_rl_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NN_RL_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nn_rl_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nn_rl_8x4_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nn_rl_8x4_lib4, .-kernel_dtrmm_nn_rl_8x4_lib4
+#endif
+
+
+
+
+
+//                                     1      2              3          4        5            6          7        8          9        10      11
+// void kernel_dtrmm_nn_rl_8x4_vs_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nn_rl_8x4_vs_lib4
+	.type kernel_dtrmm_nn_rl_8x4_vs_lib4, @function
+kernel_dtrmm_nn_rl_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nn_rl_8x4_vs_lib4
+_kernel_dtrmm_nn_rl_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nn_rl_8x4_vs_lib4
+	.def kernel_dtrmm_nn_rl_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NN_RL_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nn_rl_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nn_rl_8x4_vs_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG10, %r12 // km
+	movq	ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nn_rl_8x4_vs_lib4, .-kernel_dtrmm_nn_rl_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      1      2              3          4        5            6          7        8            9          10       11      12      13      14
+// void kernel_dtrmm_nn_rl_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nn_rl_8x4_gen_lib4
+	.type kernel_dtrmm_nn_rl_8x4_gen_lib4, @function
+kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nn_rl_8x4_gen_lib4
+_kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nn_rl_8x4_gen_lib4
+	.def kernel_dtrmm_nn_rl_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NN_RL_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nn_rl_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nn_rl_8x4_vs_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // offsetD
+	movq	ARG9, %r11 // D
+	movq	ARG10, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG11, %r13 // m0
+	movq	ARG12, %r14 // m1
+	movq	ARG13, %r15 // n0
+	movq	ARG14, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nn_rl_8x4_gen_lib4, .-kernel_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32
+// void kernel_dtrmm_nt_ru_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
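+//
+// Note (inferred, not in the original source): as in the nt gemm kernel but with a
+// triangular right factor ('ru'): the main nt loop is started past the first 4x4 block
+// (k-4, A+4*bs, B+4*bs) and the triangular head is handled afterwards by
+// inner_edge_dtrmm_nt_ru_8x4_lib4, before the usual alpha/beta blending with C.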
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_8x4_lib4
+	.type kernel_dtrmm_nt_ru_8x4_lib4, @function
+kernel_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_8x4_lib4
+_kernel_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_8x4_lib4
+	.def kernel_dtrmm_nt_ru_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+	addq	$128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+
+#if MACRO_LEVEL>=1
+//	INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+//	call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+//	callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG3, %r10 // A
+	movq	ARG4, %r11 // sda
+	sall	$5, %r11d // 4*sda*sizeof(double)
+	movq	ARG5, %r12 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_8x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_8x4_lib4, .-kernel_dtrmm_nt_ru_8x4_lib4
+#endif
+
+
+
+
+
+//                                 rdi     rsi            rdx        rcx      r8         r9            rsp+8      rsp+16   rsp+24     rsp+32   rsp+40  rsp+48
+// void kernel_dtrmm_nt_ru_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_8x4_vs_lib4
+	.type kernel_dtrmm_nt_ru_8x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_8x4_vs_lib4
+_kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_8x4_vs_lib4
+	.def kernel_dtrmm_nt_ru_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+	addq	$128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+#if MACRO_LEVEL>=1
+//	INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+//	call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+//	callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // C
+	movq	ARG8, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // store address D
+	movq	ARG10, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_8x4_vs_lib4, .-kernel_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24
+// void kernel_dpotrf_nt_l_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
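+//
+// Usage sketch (not in the original source; inferred from the code below): the
+// accumulators collect C - A * B^T (sub kernel plus unit scaling), the edge routine
+// then factorizes the 8x4 block (Cholesky step), writing, as the name suggests, the
+// inverse of the factor's diagonal to inv_diag_D, and only the lower part is stored, e.g.
+//   kernel_dpotrf_nt_l_8x4_lib4(k, A, sda, B, C, sdc, D, sdd, inv_diag_D);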
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_8x4_lib4
+	.type kernel_dpotrf_nt_l_8x4_lib4, @function
+kernel_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_8x4_lib4
+_kernel_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_8x4_lib4
+	.def kernel_dpotrf_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$4, %r11d
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_8x4_lib4, .-kernel_dpotrf_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+//                                     rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24              rsp+32  rsp+40 
+// void kernel_dpotrf_nt_l_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_8x4_vs_lib4
+	.type kernel_dpotrf_nt_l_8x4_vs_lib4, @function
+kernel_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_8x4_vs_lib4
+_kernel_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_8x4_vs_lib4
+	.def kernel_dpotrf_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG10, %r12 // km 
+	movq	ARG11, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_8x4_vs_lib4, .-kernel_dpotrf_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                        rdi     rsi         rdx       rcx         r8      r9          rsp+8     rsp+16      rsp+24     rsp+32   rsp+40     rsp+48   rsp+56
+// void kernel_dsyrk_dpotrf_nt_l_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
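+//
+// Note (inferred, not in the original source): this fuses the syrk update and the
+// Cholesky step of the kernels above: it adds Ap*Bp^T over kp iterations, subtracts
+// Am*Bm^T over km iterations, blends with C, then runs the same dpotrf edge
+// factorization and lower-triangular store, so the intermediate syrk result never
+// leaves the accumulation registers.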
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_8x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movl	$4, %r11d
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_8x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+//                                           rdi     rsi         rdx       rcx         r8      r9          rsp+8     rsp+16      rsp+24     rsp+32   rsp+40     rsp+48   rsp+56              rsp+64  rsp+72
+// void kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movq	ARG15, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG14, %r12 // km 
+	movq	ARG15, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                         rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32              rsp+40  rsp+48
+// void kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
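+//
+// Note (inferred, not in the original source): the accumulators collect C - A * B^T,
+// then inner_edge_dtrsm_rlt_inv_8x4_vs_lib4 solves against the triangular factor E
+// from the right, multiplying by the precomputed reciprocal diagonal inv_diag_E instead
+// of dividing; the masked store finally writes km rows and kn columns of D.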
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+	.type kernel_dtrsm_nt_rl_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+	.def kernel_dtrsm_nt_rl_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_8x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                         1      2          3          4        5          6          7          8        9                   10      11
+// void kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_4x8_vs_lib4
+	.type kernel_dtrsm_nt_rl_inv_4x8_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_4x8_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_4x8_vs_lib4
+	.def kernel_dtrsm_nt_rl_inv_4x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // B
+	movq	ARG4, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG2, %r13 // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_11_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_11_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_11_4x8_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+	movq	ARG8, %r11  // sde 
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG9, %r12  // inv_diag_E 
+	movq	ARG11, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x8_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x8_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG6, %r10 // store address D
+	movq	ARG10, %r11 // km 
+	movq	ARG11, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_4x8_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x8_vs_lib4
+#endif
+
+
+
+
+
+//                                               rdi     rsi         rdx       rcx          r8     r9          rsp+8     rsp+16      rsp+24     rsp+32   rsp+40     rsp+48   rsp+56     rsp+64              rsp+72  rsp+80
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
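+//
+// Note (inferred, not in the original source): analogous to the fused syrk/potrf kernel,
+// this chains the 'plus' gemm (Ap*Bp^T over kp), the 'minus' gemm (Am*Bm^T), the unit
+// blend with C and the rl_inv triangular solve against E, storing a km-by-kn sub-block
+// of D without writing back the intermediate gemm result.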
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10  // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG13, %r10  // E 
+	movq	ARG14, %r11  // inv_diag_E 
+	movq	ARG16, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG15, %r12 // km 
+	movq	ARG16, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                                1       2           3           4         5       6           7           8         9          10         11         12       13                 14       15
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4(int kp, double *Ap, double *Bp, int sdbp, int km, double *Am, double *Bm, int sdbm, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG3, %r11  // Bp
+	movq	ARG4, %r12 // sdbp
+	sall	$5, %r12d   // 32*sdbp
+	movq	ARG2, %r13  // Ap
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10 // km
+	movq	ARG7, %r11 // Bm
+	movq	ARG8, %r12 // sdbm
+	sall	$5, %r12d // 32*sdbm
+	movq	ARG6, %r13 // Am
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_11_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_11_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_11_4x8_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG11, %r10  // E 
+	movq	ARG12, %r11  // sde 
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG13, %r12  // inv_diag_E 
+	movq	ARG15, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x8_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x8_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG10, %r10 // store address D
+	movq	ARG14, %r11 // km 
+	movq	ARG15, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4
+#endif
+
+
+
+
+
+//                                      rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32 
+// void kernel_dtrsm_nt_rl_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
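+// (roughly: D <- (C - A*B^T) * E^{-T}, with E lower triangular and its reciprocal
+// diagonal passed in inv_diag_E)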
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_8x4_lib4
+	.type kernel_dtrsm_nt_rl_inv_8x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_8x4_lib4
+_kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_8x4_lib4
+	.def kernel_dtrsm_nt_rl_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_8x4_lib4, .-kernel_dtrsm_nt_rl_inv_8x4_lib4
+#endif
+
+
+
+
+
+//                                      1      2          3          4        5          6          7          8        9
+// void kernel_dtrsm_nt_rl_inv_4x8_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sde, double *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_4x8_lib4
+	.type kernel_dtrsm_nt_rl_inv_4x8_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_4x8_lib4
+_kernel_dtrsm_nt_rl_inv_4x8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_4x8_lib4
+	.def kernel_dtrsm_nt_rl_inv_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG3, %r11
+	movq	ARG4, %r12
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG2, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_11_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_11_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_11_4x8_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG7, %r10  // E 
+	movq	ARG8, %r11  // sde 
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG9, %r12  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG6, %r10 // store address D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_4x8_lib4, .-kernel_dtrsm_nt_rl_inv_4x8_lib4
+#endif
+
+
+
+
+
+//                                            rdi     rsi         rdx       rcx         r8      r9          rsp+8     rsp+16      rsp+24     rsp+32   rsp+40     rsp+48   rsp+56     rsp+64
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10  // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG13, %r10  // E 
+	movq	ARG14, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+#endif
+
+
+
+
+
+//                                      rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24
+// void kernel_dtrsm_nt_rl_one_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E);
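+// (roughly: D <- (C - A*B^T) * E^{-T}, with E unit lower triangular, so no diagonal
+// reciprocals are needed)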
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_one_8x4_lib4
+	.type kernel_dtrsm_nt_rl_one_8x4_lib4, @function
+kernel_dtrsm_nt_rl_one_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_one_8x4_lib4
+_kernel_dtrsm_nt_rl_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_one_8x4_lib4
+	.def kernel_dtrsm_nt_rl_one_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_one_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_one_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_one_8x4_lib4, .-kernel_dtrsm_nt_rl_one_8x4_lib4
+#endif
+
+
+
+
+
+//                                         rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32  rsp+40
+// void kernel_dtrsm_nt_rl_one_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+	.type kernel_dtrsm_nt_rl_one_8x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+	.def kernel_dtrsm_nt_rl_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_ONE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG10, %r12 // km 
+	movq	ARG11, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_one_8x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32 
+// void kernel_dtrsm_nt_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_ru_inv_8x4_lib4
+	.type kernel_dtrsm_nt_ru_inv_8x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_ru_inv_8x4_lib4
+_kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_ru_inv_8x4_lib4
+	.def kernel_dtrsm_nt_ru_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rut_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rut_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_ru_inv_8x4_lib4, .-kernel_dtrsm_nt_ru_inv_8x4_lib4
+#endif
+
+
+
+
+
+//                                         rdi    rsi        rdx      rcx        r8         r9       rsp+8      rsp+16   rsp+24     rsp+32              rsp+40  rsp+48
+// void kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+	.type kernel_dtrsm_nt_ru_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+	.def kernel_dtrsm_nt_ru_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // store address D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_ru_inv_8x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40
+// void kernel_dtrsm_nn_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ru_inv_8x4_lib4
+	.type kernel_dtrsm_nn_ru_inv_8x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ru_inv_8x4_lib4
+_kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ru_inv_8x4_lib4
+	.def kernel_dtrsm_nn_ru_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_run_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ru_inv_8x4_lib4, .-kernel_dtrsm_nn_ru_inv_8x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40              rsp+48  rsp+56
+// void kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+	.type kernel_dtrsm_nn_ru_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+	.def kernel_dtrsm_nn_ru_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_run_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG12, %r12 // km
+	movq	ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ru_inv_8x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40
+// void kernel_dtrsm_nn_ll_one_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde);
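+// (roughly: D <- E^{-1} * (C - A*B), solved from the left with E unit lower triangular)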
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ll_one_8x4_lib4
+	.type kernel_dtrsm_nn_ll_one_8x4_lib4, @function
+kernel_dtrsm_nn_ll_one_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ll_one_8x4_lib4
+_kernel_dtrsm_nn_ll_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ll_one_8x4_lib4
+	.def kernel_dtrsm_nn_ll_one_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // sde
+	sall	$5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lln_one_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ll_one_8x4_lib4, .-kernel_dtrsm_nn_ll_one_8x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40   rsp+48  rsp+56
+// void kernel_dtrsm_nn_ll_one_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+	.type kernel_dtrsm_nn_ll_one_8x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+	.def kernel_dtrsm_nn_ll_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // sde
+	sall	$5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lln_one_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG12, %r12 // km
+	movq	ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_ll_one_8x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40   rsp+48
+// void kernel_dtrsm_nn_lu_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
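+// (roughly: D <- E^{-1} * (C - A*B), solved from the left with E upper triangular and
+// its reciprocal diagonal passed in inv_diag_E)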
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_lu_inv_8x4_lib4
+	.type kernel_dtrsm_nn_lu_inv_8x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_lu_inv_8x4_lib4
+_kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_lu_inv_8x4_lib4
+	.def kernel_dtrsm_nn_lu_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // sde
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG12, %r12  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lun_inv_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lun_inv_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_lu_inv_8x4_lib4, .-kernel_dtrsm_nn_lu_inv_8x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx      ecx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32     rsp+40   rsp+48              rsp+56  rsp+64
+// void kernel_dtrsm_nn_lu_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+	.type kernel_dtrsm_nn_lu_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+	.def kernel_dtrsm_nn_lu_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11 // sde
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG12, %r12  // inv_diag_E 
+	movq	ARG13, %r13  // km
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LUN_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG13, %r12  // km
+	movq	ARG14, %r13  // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nn_lu_inv_8x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                                edi    rsi        rdx      rcx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32
+// void kernel_dgetrf_nn_l_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_l_8x4_lib4
+	.type kernel_dgetrf_nn_l_8x4_lib4, @function
+kernel_dgetrf_nn_l_8x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_l_8x4_lib4
+_kernel_dgetrf_nn_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_l_8x4_lib4
+	.def kernel_dgetrf_nn_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_8x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG10, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_l_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+	// epilogue
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_l_8x4_lib4, .-kernel_dgetrf_nn_l_8x4_lib4
+#endif
+
+
+
+
+
+//                                   edi    rsi        rdx      rcx        r8       r9         rsp+8    rsp+16     rsp+24   rsp+32              rsp+40  rsp+48
+// void kernel_dgetrf_nn_l_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgetrf_nn_l_8x4_vs_lib4
+	.type kernel_dgetrf_nn_l_8x4_vs_lib4, @function
+kernel_dgetrf_nn_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgetrf_nn_l_8x4_vs_lib4
+_kernel_dgetrf_nn_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgetrf_nn_l_8x4_vs_lib4
+	.def kernel_dgetrf_nn_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_8x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG10, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgetrf_l_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG11, %r12  // km
+	movq	ARG12, %r13  // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgetrf_nn_l_8x4_vs_lib4, .-kernel_dgetrf_nn_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+//                             1         2           3           4           5
+// void kernel_dlarfb4_r_8_lib4(int kmax, double *pV, double *pT, double *pD, int sdd);
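+// (this appears to apply a block of 4 Householder reflectors in compact WY form, as
+// in LAPACK dlarfb, to the 8-row panel D from the right, with pV holding the
+// reflectors and pT the 4x4 triangular factor)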
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dlarfb4_r_8_lib4
+	.type kernel_dlarfb4_r_8_lib4, @function
+kernel_dlarfb4_r_8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dlarfb4_r_8_lib4
+_kernel_dlarfb4_r_8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dlarfb4_r_8_lib4
+	.def kernel_dlarfb4_r_8_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+//	vxorpd	%ymm0, %ymm0, %ymm0
+//	vmovapd	%ymm0, %ymm1
+//	vmovapd	%ymm0, %ymm2
+//	vmovapd	%ymm0, %ymm3
+//	vmovapd	%ymm0, %ymm4
+//	vmovapd	%ymm0, %ymm5
+//	vmovapd	%ymm0, %ymm6
+//	vmovapd	%ymm0, %ymm7
+	
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // D
+	movq	ARG5, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG2, %r13 // V
+
+	//
+	vmovapd			0(%r11), %ymm0
+	vmovapd			0(%r11, %r12, 1), %ymm4
+	//
+	vmovapd			32(%r11), %ymm1
+	vmovapd			32(%r11, %r12, 1), %ymm5
+	vbroadcastsd	32(%r13), %ymm13
+	vfmadd231pd		%ymm13, %ymm1, %ymm0
+	vfmadd231pd		%ymm13, %ymm5, %ymm4
+	//
+	vmovapd			64(%r11), %ymm2
+	vmovapd			64(%r11, %r12, 1), %ymm6
+	vbroadcastsd	64(%r13), %ymm13
+	vfmadd231pd		%ymm13, %ymm2, %ymm0
+	vfmadd231pd		%ymm13, %ymm6, %ymm4
+	vbroadcastsd	72(%r13), %ymm13
+	vfmadd231pd		%ymm13, %ymm2, %ymm1
+	vfmadd231pd		%ymm13, %ymm6, %ymm5
+	//
+	vmovapd			96(%r11), %ymm3
+	vmovapd			96(%r11, %r12, 1), %ymm7
+	vbroadcastsd	96(%r13), %ymm13
+	vfmadd231pd		%ymm13, %ymm3, %ymm0
+	vfmadd231pd		%ymm13, %ymm7, %ymm4
+	vbroadcastsd	104(%r13), %ymm13
+	vfmadd231pd		%ymm13, %ymm3, %ymm1
+	vfmadd231pd		%ymm13, %ymm7, %ymm5
+	vbroadcastsd	112(%r13), %ymm13
+	vfmadd231pd		%ymm13, %ymm3, %ymm2
+	vfmadd231pd		%ymm13, %ymm7, %ymm6
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r13
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+	movq	ARG3, %r10 // T
+
+	//
+	vbroadcastsd	120(%r10), %ymm12
+	vmulpd			%ymm3, %ymm12, %ymm3
+	vmulpd			%ymm7, %ymm12, %ymm7
+	//
+	vbroadcastsd	112(%r10), %ymm12
+	vfmadd231pd		%ymm2, %ymm12, %ymm3
+	vfmadd231pd		%ymm6, %ymm12, %ymm7
+	vbroadcastsd	80(%r10), %ymm12
+	vmulpd			%ymm2, %ymm12, %ymm2
+	vmulpd			%ymm6, %ymm12, %ymm6
+	//
+	vbroadcastsd	104(%r10), %ymm12
+	vfmadd231pd		%ymm1, %ymm12, %ymm3
+	vfmadd231pd		%ymm5, %ymm12, %ymm7
+	vbroadcastsd	72(%r10), %ymm12
+	vfmadd231pd		%ymm1, %ymm12, %ymm2
+	vfmadd231pd		%ymm5, %ymm12, %ymm6
+	vbroadcastsd	40(%r10), %ymm12
+	vmulpd			%ymm1, %ymm12, %ymm1
+	vmulpd			%ymm5, %ymm12, %ymm5
+	//
+	vbroadcastsd	96(%r10), %ymm12
+	vfmadd231pd		%ymm0, %ymm12, %ymm3
+	vfmadd231pd		%ymm4, %ymm12, %ymm7
+	vbroadcastsd	64(%r10), %ymm12
+	vfmadd231pd		%ymm0, %ymm12, %ymm2
+	vfmadd231pd		%ymm4, %ymm12, %ymm6
+	vbroadcastsd	32(%r10), %ymm12
+	vfmadd231pd		%ymm0, %ymm12, %ymm1
+	vfmadd231pd		%ymm4, %ymm12, %ymm5
+	vbroadcastsd	0(%r10), %ymm12
+	vmulpd			%ymm0, %ymm12, %ymm0
+	vmulpd			%ymm4, %ymm12, %ymm4
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // V
+	movq	ARG4, %r12 // D
+	movq	ARG5, %r13 // sdd
+	sall	$5, %r13d // 4*sdd*sizeof(double)
+
+	//
+	vmovapd			0(%r12), %ymm12
+	vmovapd			0(%r12, %r13, 1), %ymm14
+	vaddpd			%ymm12, %ymm0, %ymm12
+	vaddpd			%ymm14, %ymm4, %ymm14
+	vmovapd			%ymm12, 0(%r12)
+	vmovapd			%ymm14, 0(%r12, %r13, 1)
+	//
+	vmovapd			32(%r12), %ymm12
+	vmovapd			32(%r12, %r13, 1), %ymm14
+	vbroadcastsd	32(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vfmadd231pd		%ymm4, %ymm13, %ymm14
+	vaddpd			%ymm12, %ymm1, %ymm12
+	vaddpd			%ymm14, %ymm5, %ymm14
+	vmovapd			%ymm12, 32(%r12)
+	vmovapd			%ymm14, 32(%r12, %r13, 1)
+	//
+	vmovapd			64(%r12), %ymm12
+	vmovapd			64(%r12, %r13, 1), %ymm14
+	vbroadcastsd	64(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vfmadd231pd		%ymm4, %ymm13, %ymm14
+	vbroadcastsd	72(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vfmadd231pd		%ymm5, %ymm13, %ymm14
+	vaddpd			%ymm12, %ymm2, %ymm12
+	vaddpd			%ymm14, %ymm6, %ymm14
+	vmovapd			%ymm12, 64(%r12)
+	vmovapd			%ymm14, 64(%r12, %r13, 1)
+	//
+	vmovapd			96(%r12), %ymm12
+	vmovapd			96(%r12, %r13, 1), %ymm14
+	vbroadcastsd	96(%r11), %ymm13
+	vfmadd231pd		%ymm0, %ymm13, %ymm12
+	vfmadd231pd		%ymm4, %ymm13, %ymm14
+	vbroadcastsd	104(%r11), %ymm13
+	vfmadd231pd		%ymm1, %ymm13, %ymm12
+	vfmadd231pd		%ymm5, %ymm13, %ymm14
+	vbroadcastsd	112(%r11), %ymm13
+	vfmadd231pd		%ymm2, %ymm13, %ymm12
+	vfmadd231pd		%ymm6, %ymm13, %ymm14
+	vaddpd			%ymm12, %ymm3, %ymm12
+	vaddpd			%ymm14, %ymm7, %ymm14
+	vmovapd			%ymm12, 96(%r12)
+	vmovapd			%ymm14, 96(%r12, %r13, 1)
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEBP_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgebp_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgebp_add_nn_8x4_lib4
+#endif
+#endif
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlarfb4_r_8_lib4, .-kernel_dlarfb4_r_8_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
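+	// the double-precision constants below (LC02 onward) are emitted as pairs of
+	// .long values holding the low and high 32-bit words of each IEEE-754 double
+	// (e.g. 0, 1072693248 = 0x3FF00000 encodes 1.0; 0, -1074790400 encodes -1.0)
+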
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgemm_8x8_lib4.S b/kernel/avx2/kernel_dgemm_8x8_lib4.S
new file mode 100644
index 0000000..954c96d
--- /dev/null
+++ b/kernel/avx2/kernel_dgemm_8x8_lib4.S
@@ -0,0 +1,5625 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define ARG19 STACKSIZE + 104(%rsp)
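+
+// SysV AMD64 ABI: arguments 1-6 arrive in registers, arguments 7+ are read from the
+// caller's stack past the locally reserved STACKSIZE bytes and the return address;
+// only rbx, rbp and r12-r15 are callee-saved, so the prologue/epilogue below spill
+// and restore just those (with vzeroupper around the AVX code)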
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define ARG19 STACKSIZE + 152(%rsp)
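+
+// Windows x64 ABI: arguments 1-4 arrive in rcx, rdx, r8, r9 and arguments 5+ sit on
+// the caller's stack after the 32-byte shadow space and the return address (hence
+// the +40 offset); rdi, rsi and xmm6-xmm15 are also callee-saved here, which is why
+// the prologue below spills them into the larger 256-byte frame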
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
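+
+// note on the stack-argument offsets above: on the System V AMD64 ABI the 7th
+// integer argument sits at 8(%rsp) on entry (just above the return address),
+// so after the PROLOGUE has executed subq $STACKSIZE, %rsp it is found at
+// STACKSIZE+8(%rsp), i.e. 72(%rsp) for STACKSIZE=64; on Windows x64 the 5th
+// argument is the first one passed on the stack, above the 32-byte shadow
+// space, hence the STACKSIZE+40(%rsp) offset.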
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+4*k*sizeof(double)
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
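+//
+// in plain C the accumulation performed here corresponds roughly to the
+// following sketch (illustrative only: the loop names are not part of this
+// file, the mapping of acc entries onto ymm0-ymm11 is left out, and A and B
+// are assumed stored in the lib4 panel-major layout, 4-row panels with panel
+// strides sda and sdb counted in doubles):
+//
+//	for(kk=0; kk<k; kk++)
+//		for(jj=0; jj<8; jj++)
+//			for(ii= jj<4 ? 0 : 4; ii<8; ii++) // top-right 4x4 block not computed
+//				acc[ii][jj] += A[(ii/4)*4*sda + 4*kk + ii%4]
+//				             * B[(jj/4)*4*sdb + 4*kk + jj%4]; // nt: row jj of B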
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nt_8x8_lib4, @function
+inner_kernel_dgemm_add_nt_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_8x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nt_8x8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_8x8_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd			0(%r11), %ymm12
+	vmovapd			0(%r11, %r12, 1), %ymm13
+	vbroadcastsd	0(%r13), %ymm14
+	vbroadcastsd 	0(%r13, %r14, 1), %ymm15
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vfmadd231pd		%ymm12, %ymm14, %ymm0
+	subl	$4, %r10d
+	vfmadd231pd		%ymm13, %ymm14, %ymm4
+	vbroadcastsd	8(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm8
+	vbroadcastsd	8(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm1
+	vfmadd231pd		%ymm13, %ymm14, %ymm5
+	vbroadcastsd	16(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm9
+	vbroadcastsd	16(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm2
+	vfmadd231pd		%ymm13, %ymm14, %ymm6
+	vbroadcastsd	24(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm10
+	vbroadcastsd	24(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm3
+	vmovapd			32(%r11), %ymm12
+	vfmadd231pd		%ymm13, %ymm14, %ymm7
+	vbroadcastsd	32(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm11
+	vmovapd			32(%r11, %r12, 1), %ymm13
+	vbroadcastsd	32(%r13, %r14, 1), %ymm15
+
+	// unroll 1
+	vfmadd231pd		%ymm12, %ymm14, %ymm0
+	vfmadd231pd		%ymm13, %ymm14, %ymm4
+	vbroadcastsd	40(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm8
+	vbroadcastsd	40(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm1
+	vfmadd231pd		%ymm13, %ymm14, %ymm5
+	vbroadcastsd	48(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm9
+	vbroadcastsd	48(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm2
+	vfmadd231pd		%ymm13, %ymm14, %ymm6
+	vbroadcastsd	56(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm10
+	vbroadcastsd	56(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm3
+	vmovapd			64(%r11), %ymm12
+	vfmadd231pd		%ymm13, %ymm14, %ymm7
+	vbroadcastsd	64(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm11
+	vmovapd			64(%r11, %r12, 1), %ymm13
+	vbroadcastsd	64(%r13, %r14, 1), %ymm15
+
+	// unroll 2
+	vfmadd231pd		%ymm12, %ymm14, %ymm0
+	vfmadd231pd		%ymm13, %ymm14, %ymm4
+	vbroadcastsd	72(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm8
+	vbroadcastsd	72(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm1
+	vfmadd231pd		%ymm13, %ymm14, %ymm5
+	vbroadcastsd	80(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm9
+	vbroadcastsd	80(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm2
+	vfmadd231pd		%ymm13, %ymm14, %ymm6
+	vbroadcastsd	88(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm10
+	vbroadcastsd	88(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm3
+	vmovapd			96(%r11), %ymm12
+	vfmadd231pd		%ymm13, %ymm14, %ymm7
+	vbroadcastsd	96(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm11
+	vmovapd			96(%r11, %r12, 1), %ymm13
+	vbroadcastsd	96(%r13, %r14, 1), %ymm15
+
+	// unroll 3
+	vfmadd231pd		%ymm12, %ymm14, %ymm0
+	vfmadd231pd		%ymm13, %ymm14, %ymm4
+	vbroadcastsd	104(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm8
+	vbroadcastsd	104(%r13, %r14, 1), %ymm15
+	addq	$128, %r11
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm1
+	vfmadd231pd		%ymm13, %ymm14, %ymm5
+	vbroadcastsd	112(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm9
+	vbroadcastsd	112(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm2
+	vfmadd231pd		%ymm13, %ymm14, %ymm6
+	vbroadcastsd	120(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm10
+	vbroadcastsd	120(%r13, %r14, 1), %ymm15
+	addq	$128, %r13
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm3
+	vmovapd			0(%r11), %ymm12
+	vfmadd231pd		%ymm13, %ymm14, %ymm7
+	vbroadcastsd	0(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm11
+	vmovapd			0(%r11, %r12, 1), %ymm13
+	vbroadcastsd	0(%r13, %r14, 1), %ymm15
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vfmadd231pd		%ymm12, %ymm14, %ymm0
+	subl	$4, %r10d
+	vfmadd231pd		%ymm13, %ymm14, %ymm4
+	vbroadcastsd	8(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm8
+	vbroadcastsd	8(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm1
+	vfmadd231pd		%ymm13, %ymm14, %ymm5
+	vbroadcastsd	16(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm9
+	vbroadcastsd	16(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm2
+	vfmadd231pd		%ymm13, %ymm14, %ymm6
+	vbroadcastsd	24(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm10
+	vbroadcastsd	24(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm3
+	vmovapd			32(%r11), %ymm12
+	vfmadd231pd		%ymm13, %ymm14, %ymm7
+	vbroadcastsd	32(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm11
+	vmovapd			32(%r11, %r12, 1), %ymm13
+	vbroadcastsd	32(%r13, %r14, 1), %ymm15
+
+	// unroll 1
+	vfmadd231pd		%ymm12, %ymm14, %ymm0
+	vfmadd231pd		%ymm13, %ymm14, %ymm4
+	vbroadcastsd	40(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm8
+	vbroadcastsd	40(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm1
+	vfmadd231pd		%ymm13, %ymm14, %ymm5
+	vbroadcastsd	48(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm9
+	vbroadcastsd	48(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm2
+	vfmadd231pd		%ymm13, %ymm14, %ymm6
+	vbroadcastsd	56(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm10
+	vbroadcastsd	56(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm3
+	vmovapd			64(%r11), %ymm12
+	vfmadd231pd		%ymm13, %ymm14, %ymm7
+	vbroadcastsd	64(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm11
+	vmovapd			64(%r11, %r12, 1), %ymm13
+	vbroadcastsd	64(%r13, %r14, 1), %ymm15
+
+	// unroll 2
+	vfmadd231pd		%ymm12, %ymm14, %ymm0
+	vfmadd231pd		%ymm13, %ymm14, %ymm4
+	vbroadcastsd	72(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm8
+	vbroadcastsd	72(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm1
+	vfmadd231pd		%ymm13, %ymm14, %ymm5
+	vbroadcastsd	80(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm9
+	vbroadcastsd	80(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm2
+	vfmadd231pd		%ymm13, %ymm14, %ymm6
+	vbroadcastsd	88(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm10
+	vbroadcastsd	88(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm3
+	vmovapd			96(%r11), %ymm12
+	vfmadd231pd		%ymm13, %ymm14, %ymm7
+	vbroadcastsd	96(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm11
+	vmovapd			96(%r11, %r12, 1), %ymm13
+	vbroadcastsd	96(%r13, %r14, 1), %ymm15
+
+	// unroll 3
+	vfmadd231pd		%ymm12, %ymm14, %ymm0
+	vfmadd231pd		%ymm13, %ymm14, %ymm4
+	vbroadcastsd	104(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm8
+	vbroadcastsd	104(%r13, %r14, 1), %ymm15
+	addq	$128, %r11
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm1
+	vfmadd231pd		%ymm13, %ymm14, %ymm5
+	vbroadcastsd	112(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm9
+	vbroadcastsd	112(%r13, %r14, 1), %ymm15
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm2
+	vfmadd231pd		%ymm13, %ymm14, %ymm6
+	vbroadcastsd	120(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm10
+	vbroadcastsd	120(%r13, %r14, 1), %ymm15
+	addq	$128, %r13
+
+	vfmadd231pd		%ymm12, %ymm14, %ymm3
+//	vmovapd			0(%r11), %ymm12
+	vfmadd231pd		%ymm13, %ymm14, %ymm7
+//	vbroadcastsd	0(%r13), %ymm14
+	vfmadd231pd		%ymm13, %ymm15, %ymm11
+//	vmovapd			0(%r11, %r12, 1), %ymm13
+//	vbroadcastsd	0(%r13, %r14, 1), %ymm15
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm12
+	vmovapd			0(%r11, %r12, 1), %ymm13
+	vbroadcastsd	0(%r13), %ymm14
+	vfmadd231pd		%ymm12, %ymm14, %ymm0
+	vfmadd231pd		%ymm13, %ymm14, %ymm4
+	vbroadcastsd	0(%r13, %r14, 1), %ymm15
+	vfmadd231pd		%ymm13, %ymm15, %ymm8
+	subl	$1, %r10d
+
+	vbroadcastsd	8(%r13), %ymm14
+	vfmadd231pd		%ymm12, %ymm14, %ymm1
+	vfmadd231pd		%ymm13, %ymm14, %ymm5
+	vbroadcastsd	8(%r13, %r14, 1), %ymm15
+	vfmadd231pd		%ymm13, %ymm15, %ymm9
+	addq		$32, %r11
+
+	vbroadcastsd	16(%r13), %ymm14
+	vfmadd231pd		%ymm12, %ymm14, %ymm2
+	vfmadd231pd		%ymm13, %ymm14, %ymm6
+	vbroadcastsd	16(%r13, %r14, 1), %ymm15
+	vfmadd231pd		%ymm13, %ymm15, %ymm10
+	addq		$32, %r13
+
+	vbroadcastsd	-8(%r13), %ymm14
+	vfmadd231pd		%ymm12, %ymm14, %ymm3
+	vfmadd231pd		%ymm13, %ymm14, %ymm7
+	vbroadcastsd	-8(%r13, %r14, 1), %ymm15
+	vfmadd231pd		%ymm13, %ymm15, %ymm11
+
+	cmpl		$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nt_8x8_lib4, .-inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+4*k*sizeof(double)
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
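+//
+// same inner loop as inner_kernel_dgemm_add_nt_8x8_lib4 above, with the sign
+// of the update flipped (vfnmadd231pd in place of vfmadd231pd), i.e. in the
+// same illustrative scalar form
+//
+//	acc[ii][jj] -= A[(ii/4)*4*sda + 4*kk + ii%4] * B[(jj/4)*4*sdb + 4*kk + jj%4];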
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nt_8x8_lib4, @function
+inner_kernel_dgemm_sub_nt_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_8x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nt_8x8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_8x8_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovapd			0(%r11), %ymm12
+	vmovapd			0(%r11, %r12, 1), %ymm13
+	vbroadcastsd	0(%r13), %ymm14
+	vbroadcastsd 	0(%r13, %r14, 1), %ymm15
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vfnmadd231pd	%ymm12, %ymm14, %ymm0
+	subl	$4, %r10d
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vbroadcastsd	8(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm8
+	vbroadcastsd	8(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vbroadcastsd	16(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm9
+	vbroadcastsd	16(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vbroadcastsd	24(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm10
+	vbroadcastsd	24(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm3
+	vmovapd			32(%r11), %ymm12
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vbroadcastsd	32(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm11
+	vmovapd			32(%r11, %r12, 1), %ymm13
+	vbroadcastsd	32(%r13, %r14, 1), %ymm15
+
+	// unroll 1
+	vfnmadd231pd	%ymm12, %ymm14, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vbroadcastsd	40(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm8
+	vbroadcastsd	40(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vbroadcastsd	48(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm9
+	vbroadcastsd	48(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vbroadcastsd	56(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm10
+	vbroadcastsd	56(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm3
+	vmovapd			64(%r11), %ymm12
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vbroadcastsd	64(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm11
+	vmovapd			64(%r11, %r12, 1), %ymm13
+	vbroadcastsd	64(%r13, %r14, 1), %ymm15
+
+	// unroll 2
+	vfnmadd231pd	%ymm12, %ymm14, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vbroadcastsd	72(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm8
+	vbroadcastsd	72(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vbroadcastsd	80(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm9
+	vbroadcastsd	80(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vbroadcastsd	88(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm10
+	vbroadcastsd	88(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm3
+	vmovapd			96(%r11), %ymm12
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vbroadcastsd	96(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm11
+	vmovapd			96(%r11, %r12, 1), %ymm13
+	vbroadcastsd	96(%r13, %r14, 1), %ymm15
+
+	// unroll 3
+	vfnmadd231pd	%ymm12, %ymm14, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vbroadcastsd	104(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm8
+	vbroadcastsd	104(%r13, %r14, 1), %ymm15
+	addq	$128, %r11
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vbroadcastsd	112(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm9
+	vbroadcastsd	112(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vbroadcastsd	120(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm10
+	vbroadcastsd	120(%r13, %r14, 1), %ymm15
+	addq	$128, %r13
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm3
+	vmovapd			0(%r11), %ymm12
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vbroadcastsd	0(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm11
+	vmovapd			0(%r11, %r12, 1), %ymm13
+	vbroadcastsd	0(%r13, %r14, 1), %ymm15
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vfnmadd231pd	%ymm12, %ymm14, %ymm0
+	subl	$4, %r10d
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vbroadcastsd	8(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm8
+	vbroadcastsd	8(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vbroadcastsd	16(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm9
+	vbroadcastsd	16(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vbroadcastsd	24(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm10
+	vbroadcastsd	24(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm3
+	vmovapd			32(%r11), %ymm12
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vbroadcastsd	32(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm11
+	vmovapd			32(%r11, %r12, 1), %ymm13
+	vbroadcastsd	32(%r13, %r14, 1), %ymm15
+
+	// unroll 1
+	vfnmadd231pd	%ymm12, %ymm14, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vbroadcastsd	40(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm8
+	vbroadcastsd	40(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vbroadcastsd	48(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm9
+	vbroadcastsd	48(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vbroadcastsd	56(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm10
+	vbroadcastsd	56(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm3
+	vmovapd			64(%r11), %ymm12
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vbroadcastsd	64(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm11
+	vmovapd			64(%r11, %r12, 1), %ymm13
+	vbroadcastsd	64(%r13, %r14, 1), %ymm15
+
+	// unroll 2
+	vfnmadd231pd	%ymm12, %ymm14, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vbroadcastsd	72(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm8
+	vbroadcastsd	72(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vbroadcastsd	80(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm9
+	vbroadcastsd	80(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vbroadcastsd	88(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm10
+	vbroadcastsd	88(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm3
+	vmovapd			96(%r11), %ymm12
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vbroadcastsd	96(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm11
+	vmovapd			96(%r11, %r12, 1), %ymm13
+	vbroadcastsd	96(%r13, %r14, 1), %ymm15
+
+	// unroll 3
+	vfnmadd231pd	%ymm12, %ymm14, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vbroadcastsd	104(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm8
+	vbroadcastsd	104(%r13, %r14, 1), %ymm15
+	addq	$128, %r11
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vbroadcastsd	112(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm9
+	vbroadcastsd	112(%r13, %r14, 1), %ymm15
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vbroadcastsd	120(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm10
+	vbroadcastsd	120(%r13, %r14, 1), %ymm15
+	addq	$128, %r13
+
+	vfnmadd231pd	%ymm12, %ymm14, %ymm3
+//	vmovapd			0(%r11), %ymm12
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+//	vbroadcastsd	0(%r13), %ymm14
+	vfnmadd231pd	%ymm13, %ymm15, %ymm11
+//	vmovapd			0(%r11, %r12, 1), %ymm13
+//	vbroadcastsd	0(%r13, %r14, 1), %ymm15
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm12
+	vmovapd			0(%r11, %r12, 1), %ymm13
+	vbroadcastsd	0(%r13), %ymm14
+	vfnmadd231pd	%ymm12, %ymm14, %ymm0
+	vfnmadd231pd	%ymm13, %ymm14, %ymm4
+	vbroadcastsd	0(%r13, %r14, 1), %ymm15
+	vfnmadd231pd	%ymm13, %ymm15, %ymm8
+	subl	$1, %r10d
+
+	vbroadcastsd	8(%r13), %ymm14
+	vfnmadd231pd	%ymm12, %ymm14, %ymm1
+	vfnmadd231pd	%ymm13, %ymm14, %ymm5
+	vbroadcastsd	8(%r13, %r14, 1), %ymm15
+	vfnmadd231pd	%ymm13, %ymm15, %ymm9
+	addq		$32, %r11
+
+	vbroadcastsd	16(%r13), %ymm14
+	vfnmadd231pd	%ymm12, %ymm14, %ymm2
+	vfnmadd231pd	%ymm13, %ymm14, %ymm6
+	vbroadcastsd	16(%r13, %r14, 1), %ymm15
+	vfnmadd231pd	%ymm13, %ymm15, %ymm10
+	addq		$32, %r13
+
+	vbroadcastsd	-8(%r13), %ymm14
+	vfnmadd231pd	%ymm12, %ymm14, %ymm3
+	vfnmadd231pd	%ymm13, %ymm14, %ymm7
+	vbroadcastsd	-8(%r13, %r14, 1), %ymm15
+	vfnmadd231pd	%ymm13, %ymm15, %ymm11
+
+	cmpl		$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nt_8x8_lib4, .-inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- &alpha
+// r11   <- &beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- &alpha
+// r11   <- &beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
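+//
+// illustrative scalar form of this scaling (acc stands for the accumulator in
+// ymm0-ymm11, C(ii,jj) for the element of C addressed through r12/r13 in the
+// lib4 layout; loop names are not part of this file):
+//
+//	for(jj=0; jj<8; jj++)
+//		for(ii= jj<4 ? 0 : 4; ii<8; ii++)
+//			acc[ii][jj] = alpha*acc[ii][jj] + beta*C(ii,jj);
+//
+// the beta*C term, and the loads from C, are skipped entirely when beta==0.0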
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x8_lib4, @function
+inner_scale_ab_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x8_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_lib4:
+#endif
+#endif
+		
+
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	vmulpd		%ymm8, %ymm15, %ymm8
+	vmulpd		%ymm9, %ymm15, %ymm9
+	vmulpd		%ymm10, %ymm15, %ymm10
+	vmulpd		%ymm11, %ymm15, %ymm11
+
+	vbroadcastsd 0(%r11), %ymm14 // beta
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+	vmovapd		0(%r12, %r13, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		32(%r12, %r13, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		64(%r12, %r13, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		96(%r12, %r13, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+	vmovapd		128(%r12, %r13, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+	vmovapd		160(%r12, %r13, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+	vmovapd		192(%r12, %r13, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd		224(%r12, %r13, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x8_lib4, .-inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- &alpha
+// r11   <- &beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- &alpha
+// r11   <- &beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
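+//
+// as inner_scale_ab_8x8_lib4, but each 4x4 sub-block of the accumulator is
+// first transposed in registers (vunpcklpd/vunpckhpd + vperm2f128), so that in
+// the same illustrative scalar form the result kept in the registers is
+//
+//	res[jj][ii] = alpha*acc[ii][jj] + beta*C(jj,ii);
+//
+// i.e. the transposed (upper) counterpart of the 8x8 block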
+
+#if MACRO_LEVEL>=1
+	.macro INNER_TRAN_SCALE_AB_8X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_tran_scale_ab_8x8_lib4, @function
+inner_tran_scale_ab_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_8x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_tran_scale_ab_8x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_ab_8x8_lib4:
+#endif
+#endif
+		
+
+	vunpcklpd	%ymm1, %ymm0, %ymm12
+	vunpckhpd	%ymm1, %ymm0, %ymm13
+	vunpcklpd	%ymm3, %ymm2, %ymm14
+	vunpckhpd	%ymm3, %ymm2, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm0
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm2
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm3
+
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	vunpcklpd	%ymm5, %ymm4, %ymm12
+	vunpckhpd	%ymm5, %ymm4, %ymm13
+	vunpcklpd	%ymm7, %ymm6, %ymm14
+	vunpckhpd	%ymm7, %ymm6, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm4
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm6
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm5
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm7
+
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm4, %ymm15, %ymm4
+	vmulpd		%ymm5, %ymm15, %ymm5
+	vmulpd		%ymm6, %ymm15, %ymm6
+	vmulpd		%ymm7, %ymm15, %ymm7
+
+	vunpcklpd	%ymm9, %ymm8, %ymm12
+	vunpckhpd	%ymm9, %ymm8, %ymm13
+	vunpcklpd	%ymm11, %ymm10, %ymm14
+	vunpckhpd	%ymm11, %ymm10, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm8
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm10
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm9
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm11
+
+	vbroadcastsd 0(%r10), %ymm15 // alpha
+
+	vmulpd		%ymm8, %ymm15, %ymm8
+	vmulpd		%ymm9, %ymm15, %ymm9
+	vmulpd		%ymm10, %ymm15, %ymm10
+	vmulpd		%ymm11, %ymm15, %ymm11
+
+	vbroadcastsd 0(%r11), %ymm14 // beta
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovapd		0(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+	vmovapd		128(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		160(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		192(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		224(%r12), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+	vmovapd		128(%r12, %r13, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+	vmovapd		160(%r12, %r13, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+	vmovapd		192(%r12, %r13, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd		224(%r12, %r13, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_tran_scale_ab_8x8_lib4, .-inner_tran_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
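+//
+// special case of inner_scale_ab_8x8_lib4 with alpha=1.0 and beta=1.0, i.e. in
+// illustrative scalar form  acc[ii][jj] += C(ii,jj);  the constant 1.0 is
+// loaded from LC04 so that the same fused multiply-add sequence can be reused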
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_8X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_8x8_lib4, @function
+inner_scale_11_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_8x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_8x8_lib4; .scl 2; .type 32; .endef
+inner_scale_11_8x8_lib4:
+#endif
+#endif
+		
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm14 // beta=1.0
+#else
+	vmovapd		LC04(%rip), %ymm14 // beta=1.0
+#endif
+
+	vmovapd		0(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+	vmovapd		0(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		32(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		64(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		96(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+	vmovapd		128(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+	vmovapd		160(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+	vmovapd		192(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd		224(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_8x8_lib4, .-inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm4  <- [d40 d51 d62 d73]
+// ymm5  <- [d41 d50 d63 d72]
+// ymm6  <- [d43 d52 d61 d70]
+// ymm7  <- [d42 d53 d60 d71]
+// ymm8  <- [d80 d91 da2 db3]
+// ymm9  <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
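+//
+// transposed counterpart of inner_scale_11_8x8_lib4: the 4x4 sub-blocks are
+// transposed as in inner_tran_scale_ab_8x8_lib4, then, in the same
+// illustrative scalar form,  res[jj][ii] = acc[ii][jj] + C(jj,ii);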
+
+#if MACRO_LEVEL>=1
+	.macro INNER_TRAN_SCALE_11_8X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_tran_scale_11_8x8_lib4, @function
+inner_tran_scale_11_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_11_8x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_tran_scale_11_8x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_11_8x8_lib4:
+#endif
+#endif
+		
+
+	vunpcklpd	%ymm1, %ymm0, %ymm12
+	vunpckhpd	%ymm1, %ymm0, %ymm13
+	vunpcklpd	%ymm3, %ymm2, %ymm14
+	vunpckhpd	%ymm3, %ymm2, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm0
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm2
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm1
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm3
+
+	vunpcklpd	%ymm5, %ymm4, %ymm12
+	vunpckhpd	%ymm5, %ymm4, %ymm13
+	vunpcklpd	%ymm7, %ymm6, %ymm14
+	vunpckhpd	%ymm7, %ymm6, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm4
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm6
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm5
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm7
+
+	vunpcklpd	%ymm9, %ymm8, %ymm12
+	vunpckhpd	%ymm9, %ymm8, %ymm13
+	vunpcklpd	%ymm11, %ymm10, %ymm14
+	vunpckhpd	%ymm11, %ymm10, %ymm15
+
+	vperm2f128	$0x20, %ymm14, %ymm12, %ymm8
+	vperm2f128	$0x31, %ymm14, %ymm12, %ymm10
+	vperm2f128	$0x20, %ymm15, %ymm13, %ymm9
+	vperm2f128	$0x31, %ymm15, %ymm13, %ymm11
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovapd		.LC04(%rip), %ymm14 // beta=1.0
+#else
+	vmovapd		LC04(%rip), %ymm14 // beta=1.0
+#endif
+
+	vmovapd		0(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm0
+	vmovapd		32(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm1
+	vmovapd		64(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm2
+	vmovapd		96(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm3
+
+	vmovapd		128(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm4
+	vmovapd		160(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm5
+	vmovapd		192(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm6
+	vmovapd		224(%r10), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm7
+
+	vmovapd		128(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm8
+	vmovapd		160(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm9
+	vmovapd		192(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm10
+	vmovapd		224(%r10, %r11, 1), %ymm15
+	vfmadd231pd	%ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_tran_scale_11_8x8_lib4, .-inner_tran_scale_11_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// Cholesky factorization
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
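+//
+// illustrative scalar form of the factorization applied to the 8x8
+// accumulator (acc and the loop names are not part of this file; acc(jj,jj)
+// denotes the jj-th diagonal element held in ymm0-ymm11):
+//
+//	for(jj=0; jj<8; jj++)
+//		{
+//		tmp = acc(jj,jj);
+//		tmp = tmp>0.0 ? 1.0/sqrt(tmp) : 0.0; // zero out non-positive pivots
+//		inv_diag_E[jj] = tmp;
+//		for(ii=jj; ii<8; ii++)
+//			acc(ii,jj) *= tmp; // scale column jj
+//		for(kk=jj+1; kk<8; kk++)
+//			for(ii=kk; ii<8; ii++)
+//				acc(ii,kk) -= acc(ii,jj)*acc(kk,jj); // update trailing columns
+//		}
+//
+// being a _vs (variable size) routine, it returns right after scaling column 4
+// when kn<6, after column 5 when kn<7 and after column 6 when kn<8 (the
+// cmpl $6/$7/$8, %r11d checks below)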
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DPOTRF_8X8_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dpotrf_8x8_vs_lib4, @function
+inner_edge_dpotrf_8x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_8x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dpotrf_8x8_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_8x8_vs_lib4:
+#endif
+#endif
+	
+	vxorpd			%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd			LC04(%rip), %xmm14 // 1.0
+#endif
+
+	vmovsd			%xmm0, %xmm0, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe				1f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+2:
+	vmovsd			%xmm13, 0(%r10)
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vpermpd			$0x55, %ymm0, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vperm2f128		$0x00, %ymm4, %ymm4, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm8
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm9
+	vperm2f128		$0x11, %ymm4, %ymm4, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm10
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm11
+
+	vpermilpd		$0x3, %xmm1, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe				3f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+4:
+	vmovsd			%xmm13, 8(%r10)
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vperm2f128		$0x00, %ymm5, %ymm5, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm8
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm9
+	vperm2f128		$0x11, %ymm5, %ymm5, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm10
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm11
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe				5f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+6:
+	vmovsd			%xmm13, 16(%r10)
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vpermpd			$0xff, %ymm2, %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+	vperm2f128		$0x00, %ymm6, %ymm6, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm8
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm9
+	vperm2f128		$0x11, %ymm6, %ymm6, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm10
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm11
+
+	vpermpd			$0xff, %ymm3, %ymm13
+	vucomisd		%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe				7f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+8:
+	vmovsd			%xmm13, 24(%r10)
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+	vperm2f128		$0x00, %ymm7, %ymm7, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm8
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm9
+	vperm2f128		$0x11, %ymm7, %ymm7, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm10
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm11
+
+	vmovsd			%xmm8, %xmm8, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_44 > 0.0 ?
+	jbe				9f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+10:
+	vmovsd			%xmm13, 32(%r10)
+//	vmovddup		%xmm13, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm8, %ymm13, %ymm8
+	cmpl			$6, %r11d
+	jl				0f // ret
+//	vperm2f128		$0x00, %ymm8, %ymm8, %ymm12
+//	vpermilpd		$0xf, %ymm12, %ymm13
+	vpermpd			$0x55, %ymm8, %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm9
+	vperm2f128		$0x11, %ymm8, %ymm8, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm10
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm11
+
+	vpermilpd		$0x3, %xmm9, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_55 > 0.0 ?
+	jbe				11f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+12:
+	vmovsd			%xmm13, 40(%r10)
+//	vmovddup		%xmm13, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm9, %ymm13, %ymm9
+	cmpl			$7, %r11d
+	jl				0f // ret
+	vperm2f128		$0x11, %ymm9, %ymm9, %ymm12
+	vpermilpd		$0x0, %ymm12, %ymm13
+	vfnmadd231pd	%ymm9, %ymm13, %ymm10
+	vpermilpd		$0xf, %ymm12, %ymm13
+	vfnmadd231pd	%ymm9, %ymm13, %ymm11
+
+	vextractf128	$0x1, %ymm10, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_66 > 0.0 ?
+	jbe				13f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+14:
+	vmovsd			%xmm13, 48(%r10)
+//	vmovddup		%xmm13, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm10, %ymm13, %ymm10
+	cmpl			$8, %r11d
+	jl				0f // ret
+//	vperm2f128		$0x11, %ymm10, %ymm10, %ymm12
+//	vpermilpd		$0xf, %ymm12, %ymm13
+	vpermpd			$0xff, %ymm10, %ymm13
+	vfnmadd231pd	%ymm10, %ymm13, %ymm11
+
+//	vextractf128	$0x1, %ymm11, %xmm13
+//	vpermilpd		$0x3, %xmm13, %xmm13
+	vpermpd			$0xff, %ymm11, %ymm13
+	vucomisd		%xmm15, %xmm13 // d_77 > 0.0 ?
+	jbe				15f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+16:
+	vmovsd			%xmm13, 56(%r10)
+//	vmovddup		%xmm13, %xmm13
+//	vperm2f128		$0x00, %ymm13, %ymm13, %ymm13
+	vpermpd			$0x00, %ymm13, %ymm13
+	vmulpd			%ymm11, %ymm13, %ymm11
+
+
+
+	jmp				0f
+
+1:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				2b
+
+3:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				4b
+
+5:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				6b
+
+7:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				8b
+
+9:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				10b
+
+11:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				12b
+
+13:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				14b
+
+15:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp				16b
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dpotrf_8x8_vs_lib4, .-inner_edge_dpotrf_8x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for Cholesky factorization
+//
+// input arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
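+//
+// illustrative scalar form of this solve (acc <- acc * E^{-T}, E lower
+// triangular with precomputed reciprocal diagonal inv_diag_E; acc and the
+// loop names are not part of this file):
+//
+//	for(jj=0; jj<8; jj++)
+//		{
+//		for(ii= jj<4 ? 0 : 4; ii<8; ii++)
+//			acc(ii,jj) *= inv_diag_E[jj];
+//		for(kk=jj+1; kk<8; kk++)
+//			for(ii= kk<4 ? 0 : 4; ii<8; ii++)
+//				acc(ii,kk) -= acc(ii,jj) * E(kk,jj);
+//		}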
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_8X8L_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_8x8l_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x8l_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x8l_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_8x8l_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x8l_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r12), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vbroadcastsd	0(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm8
+	vbroadcastsd	8(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm9
+	vbroadcastsd	16(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm10
+	vbroadcastsd	24(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm11
+
+	vbroadcastsd	8(%r12), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vbroadcastsd	32(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm8
+	vbroadcastsd	40(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm9
+	vbroadcastsd	48(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm10
+	vbroadcastsd	56(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm11
+
+	vbroadcastsd	16(%r12), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+	vbroadcastsd	64(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm8
+	vbroadcastsd	72(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm9
+	vbroadcastsd	80(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm10
+	vbroadcastsd	88(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm11
+
+	vbroadcastsd	24(%r12), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+	vbroadcastsd	96(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm8
+	vbroadcastsd	104(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm9
+	vbroadcastsd	112(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm10
+	vbroadcastsd	120(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm11
+	addq	$128, %r10
+
+	vbroadcastsd	32(%r12), %ymm13
+	vmulpd			%ymm8, %ymm13, %ymm8
+	vbroadcastsd	8(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm9
+	vbroadcastsd	16(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm10
+	vbroadcastsd	24(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm11
+
+	vbroadcastsd	40(%r12), %ymm13
+	vmulpd			%ymm9, %ymm13, %ymm9
+	vbroadcastsd	48(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm9, %ymm13, %ymm10
+	vbroadcastsd	56(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm9, %ymm13, %ymm11
+
+	vbroadcastsd	48(%r12), %ymm13
+	vmulpd			%ymm10, %ymm13, %ymm10
+	vbroadcastsd	88(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm10, %ymm13, %ymm11
+
+	vbroadcastsd	56(%r12), %ymm13
+	vmulpd			%ymm11, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_8x8l_lib4, .-inner_edge_dtrsm_rlt_inv_8x8l_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for Cholesky factorization
+//
+// input arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// r13  <- D
+// r14  <- 4*sdd*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// r13  <- D
+// r14  <- 4*sdd*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
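+//
+// same right solve as inner_edge_dtrsm_rlt_inv_8x8l_lib4 (see the sketch
+// there), but here ymm0-ymm11 hold the upper part of the 8x8 block (rows 0-3
+// of all 8 columns plus rows 4-7 of columns 4-7); rows 4-7 of columns 0-3 are
+// re-loaded from the already computed D (the vmovapd (%r13, %r14, 1) loads)
+// so that their contribution can still be subtracted from columns 4-7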
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_8X8U_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_8x8u_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x8u_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x8u_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_8x8u_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x8u_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r12), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+
+	vmovapd			0(%r13, %r14, 1), %ymm12
+	vbroadcastsd	0(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm4
+	vfnmadd231pd	%ymm12, %ymm13, %ymm8
+	vbroadcastsd	8(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm5
+	vfnmadd231pd	%ymm12, %ymm13, %ymm9
+	vbroadcastsd	16(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm6
+	vfnmadd231pd	%ymm12, %ymm13, %ymm10
+	vbroadcastsd	24(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm7
+	vfnmadd231pd	%ymm12, %ymm13, %ymm11
+
+
+	vbroadcastsd	8(%r12), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+
+	vmovapd			32(%r13, %r14, 1), %ymm12
+	vbroadcastsd	32(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm4
+	vfnmadd231pd	%ymm12, %ymm13, %ymm8
+	vbroadcastsd	40(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm5
+	vfnmadd231pd	%ymm12, %ymm13, %ymm9
+	vbroadcastsd	48(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm6
+	vfnmadd231pd	%ymm12, %ymm13, %ymm10
+	vbroadcastsd	56(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm7
+	vfnmadd231pd	%ymm12, %ymm13, %ymm11
+
+
+	vbroadcastsd	16(%r12), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+
+	vmovapd			64(%r13, %r14, 1), %ymm12
+	vbroadcastsd	64(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm4
+	vfnmadd231pd	%ymm12, %ymm13, %ymm8
+	vbroadcastsd	72(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm5
+	vfnmadd231pd	%ymm12, %ymm13, %ymm9
+	vbroadcastsd	80(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm6
+	vfnmadd231pd	%ymm12, %ymm13, %ymm10
+	vbroadcastsd	88(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm7
+	vfnmadd231pd	%ymm12, %ymm13, %ymm11
+
+
+	vbroadcastsd	24(%r12), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+
+	vmovapd			96(%r13, %r14, 1), %ymm12
+	vbroadcastsd	96(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm4
+	vfnmadd231pd	%ymm12, %ymm13, %ymm8
+	vbroadcastsd	104(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm5
+	vfnmadd231pd	%ymm12, %ymm13, %ymm9
+	vbroadcastsd	112(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm6
+	vfnmadd231pd	%ymm12, %ymm13, %ymm10
+	vbroadcastsd	120(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm7
+	vfnmadd231pd	%ymm12, %ymm13, %ymm11
+
+	addq	$128, %r10
+
+	vbroadcastsd	32(%r12), %ymm13
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vmulpd			%ymm8, %ymm13, %ymm8
+	vbroadcastsd	8(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vfnmadd231pd	%ymm8, %ymm13, %ymm9
+	vbroadcastsd	16(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vfnmadd231pd	%ymm8, %ymm13, %ymm10
+	vbroadcastsd	24(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vfnmadd231pd	%ymm8, %ymm13, %ymm11
+
+	vbroadcastsd	40(%r12), %ymm13
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vmulpd			%ymm9, %ymm13, %ymm9
+	vbroadcastsd	48(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vfnmadd231pd	%ymm9, %ymm13, %ymm10
+	vbroadcastsd	56(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vfnmadd231pd	%ymm9, %ymm13, %ymm11
+
+	vbroadcastsd	48(%r12), %ymm13
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vmulpd			%ymm10, %ymm13, %ymm10
+	vbroadcastsd	88(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+	vfnmadd231pd	%ymm10, %ymm13, %ymm11
+
+	vbroadcastsd	56(%r12), %ymm13
+	vmulpd			%ymm7, %ymm13, %ymm7
+	vmulpd			%ymm11, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_8x8u_lib4, .-inner_edge_dtrsm_rlt_inv_8x8u_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for Cholesky factorization
+//
+// input arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// r13d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// r13d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
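+//
+// variable-size variant of inner_edge_dtrsm_rlt_inv_8x8l_lib4: same solve,
+// but the solve of columns 5, 6 and 7 is skipped when kn < 6, 7 or 8
+// respectively (the cmpl $6/$7/$8, %r13d checks below)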
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_8X8L_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r12), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vbroadcastsd	0(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm8
+	vbroadcastsd	8(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm9
+	vbroadcastsd	16(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm10
+	vbroadcastsd	24(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm11
+
+	vbroadcastsd	8(%r12), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vbroadcastsd	32(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm8
+	vbroadcastsd	40(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm9
+	vbroadcastsd	48(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm10
+	vbroadcastsd	56(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm11
+
+	vbroadcastsd	16(%r12), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+	vbroadcastsd	64(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm8
+	vbroadcastsd	72(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm9
+	vbroadcastsd	80(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm10
+	vbroadcastsd	88(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm11
+
+	vbroadcastsd	24(%r12), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+	vmulpd			%ymm7, %ymm13, %ymm7
+	vbroadcastsd	96(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm8
+	vbroadcastsd	104(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm9
+	vbroadcastsd	112(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm10
+	vbroadcastsd	120(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm7, %ymm13, %ymm11
+	addq	$128, %r10
+
+	vbroadcastsd	32(%r12), %ymm13
+	vmulpd			%ymm8, %ymm13, %ymm8
+	cmpl			$6, %r13d
+	jl				0f // ret
+	vbroadcastsd	8(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm9
+	vbroadcastsd	16(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm10
+	vbroadcastsd	24(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm8, %ymm13, %ymm11
+
+	vbroadcastsd	40(%r12), %ymm13
+	vmulpd			%ymm9, %ymm13, %ymm9
+	cmpl			$7, %r13d
+	jl				0f // ret
+	vbroadcastsd	48(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm9, %ymm13, %ymm10
+	vbroadcastsd	56(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm9, %ymm13, %ymm11
+
+	vbroadcastsd	48(%r12), %ymm13
+	vmulpd			%ymm10, %ymm13, %ymm10
+	cmpl			$8, %r13d
+	jl				0f // ret
+	vbroadcastsd	88(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm10, %ymm13, %ymm11
+
+	vbroadcastsd	56(%r12), %ymm13
+	vmulpd			%ymm11, %ymm13, %ymm11
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4, .-inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for Cholesky factorization
+//
+// input arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// r13  <- D
+// r14  <- 4*sdd*sizeof(double)
+// r15d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- 4*sde*sizeof(double)
+// r12  <- inv_diag_E
+// r13  <- D
+// r14  <- 4*sdd*sizeof(double)
+// r15d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
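+//
+// variable-size variant of inner_edge_dtrsm_rlt_inv_8x8u_lib4: same solve,
+// with the last columns skipped when kn < 6, 7 or 8 (the cmpl $6/$7/$8, %r15d
+// checks below)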
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_8X8U_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4:
+#endif
+#endif
+	
+	vbroadcastsd	0(%r12), %ymm13
+	vmulpd			%ymm0, %ymm13, %ymm0
+	vbroadcastsd	8(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm1
+	vbroadcastsd	16(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm2
+	vbroadcastsd	24(%r10), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm3
+
+	vmovapd			0(%r13, %r14, 1), %ymm12
+	vbroadcastsd	0(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm4
+	vfnmadd231pd	%ymm12, %ymm13, %ymm8
+	vbroadcastsd	8(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm5
+	vfnmadd231pd	%ymm12, %ymm13, %ymm9
+	vbroadcastsd	16(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm6
+	vfnmadd231pd	%ymm12, %ymm13, %ymm10
+	vbroadcastsd	24(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm0, %ymm13, %ymm7
+	vfnmadd231pd	%ymm12, %ymm13, %ymm11
+
+
+	vbroadcastsd	8(%r12), %ymm13
+	vmulpd			%ymm1, %ymm13, %ymm1
+	vbroadcastsd	48(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm2
+	vbroadcastsd	56(%r10), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm3
+
+	vmovapd			32(%r13, %r14, 1), %ymm12
+	vbroadcastsd	32(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm4
+	vfnmadd231pd	%ymm12, %ymm13, %ymm8
+	vbroadcastsd	40(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm5
+	vfnmadd231pd	%ymm12, %ymm13, %ymm9
+	vbroadcastsd	48(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm6
+	vfnmadd231pd	%ymm12, %ymm13, %ymm10
+	vbroadcastsd	56(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm1, %ymm13, %ymm7
+	vfnmadd231pd	%ymm12, %ymm13, %ymm11
+
+
+	vbroadcastsd	16(%r12), %ymm13
+	vmulpd			%ymm2, %ymm13, %ymm2
+	vbroadcastsd	88(%r10), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm3
+
+	vmovapd			64(%r13, %r14, 1), %ymm12
+	vbroadcastsd	64(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm4
+	vfnmadd231pd	%ymm12, %ymm13, %ymm8
+	vbroadcastsd	72(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm5
+	vfnmadd231pd	%ymm12, %ymm13, %ymm9
+	vbroadcastsd	80(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm6
+	vfnmadd231pd	%ymm12, %ymm13, %ymm10
+	vbroadcastsd	88(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm2, %ymm13, %ymm7
+	vfnmadd231pd	%ymm12, %ymm13, %ymm11
+
+
+	vbroadcastsd	24(%r12), %ymm13
+	vmulpd			%ymm3, %ymm13, %ymm3
+
+	vmovapd			96(%r13, %r14, 1), %ymm12
+	vbroadcastsd	96(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm4
+	vfnmadd231pd	%ymm12, %ymm13, %ymm8
+	vbroadcastsd	104(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm5
+	vfnmadd231pd	%ymm12, %ymm13, %ymm9
+	vbroadcastsd	112(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm6
+	vfnmadd231pd	%ymm12, %ymm13, %ymm10
+	vbroadcastsd	120(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm3, %ymm13, %ymm7
+	vfnmadd231pd	%ymm12, %ymm13, %ymm11
+
+	addq	$128, %r10
+
+	vbroadcastsd	32(%r12), %ymm13
+	vmulpd			%ymm4, %ymm13, %ymm4
+	vmulpd			%ymm8, %ymm13, %ymm8
+	cmpl			$6, %r15d
+	jl				0f // ret
+	vbroadcastsd	8(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm5
+	vfnmadd231pd	%ymm8, %ymm13, %ymm9
+	vbroadcastsd	16(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm6
+	vfnmadd231pd	%ymm8, %ymm13, %ymm10
+	vbroadcastsd	24(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm4, %ymm13, %ymm7
+	vfnmadd231pd	%ymm8, %ymm13, %ymm11
+
+	vbroadcastsd	40(%r12), %ymm13
+	vmulpd			%ymm5, %ymm13, %ymm5
+	vmulpd			%ymm9, %ymm13, %ymm9
+	cmpl			$7, %r15d
+	jl				0f // ret
+	vbroadcastsd	48(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm6
+	vfnmadd231pd	%ymm9, %ymm13, %ymm10
+	vbroadcastsd	56(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm5, %ymm13, %ymm7
+	vfnmadd231pd	%ymm9, %ymm13, %ymm11
+
+	vbroadcastsd	48(%r12), %ymm13
+	vmulpd			%ymm6, %ymm13, %ymm6
+	vmulpd			%ymm10, %ymm13, %ymm10
+	cmpl			$8, %r15d
+	jl				0f // ret
+	vbroadcastsd	88(%r10, %r11, 1), %ymm13
+	vfnmadd231pd	%ymm6, %ymm13, %ymm7
+	vfnmadd231pd	%ymm10, %ymm13, %ymm11
+
+	vbroadcastsd	56(%r12), %ymm13
+	vmulpd			%ymm7, %ymm13, %ymm7
+	vmulpd			%ymm11, %ymm13, %ymm11
+
+
+
+//	subq	$128, %r10
+//	vmovapd	0(%r10, %r11, 1), %ymm4
+//	vmovapd	32(%r10, %r11, 1), %ymm5
+//	vmovapd	64(%r10, %r11, 1), %ymm6
+//	vmovapd	96(%r10, %r11, 1), %ymm7
+
+
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4, .-inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X8L_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x8l_lib4, @function
+inner_store_8x8l_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x8l_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x8l_lib4; .scl 2; .type 32; .endef
+inner_store_8x8l_lib4:
+#endif
+#endif
+	
+	vmovapd %ymm0,  0(%r10)
+	vmovapd %ymm1, 32(%r10)
+	vmovapd %ymm2, 64(%r10)
+	vmovapd %ymm3, 96(%r10)
+
+	vmovapd %ymm4,  0(%r10, %r11, 1)
+	vmovapd %ymm5, 32(%r10, %r11, 1)
+	vmovapd %ymm6, 64(%r10, %r11, 1)
+	vmovapd %ymm7, 96(%r10, %r11, 1)
+
+	vmovapd %ymm8,  128(%r10, %r11, 1)
+	vmovapd %ymm9,  160(%r10, %r11, 1)
+	vmovapd %ymm10, 192(%r10, %r11, 1)
+	vmovapd %ymm11, 224(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x8l_lib4, .-inner_store_8x8l_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X8U_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x8u_lib4, @function
+inner_store_8x8u_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x8u_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x8u_lib4; .scl 2; .type 32; .endef
+inner_store_8x8u_lib4:
+#endif
+#endif
+	
+	vmovapd %ymm0,  0(%r10)
+	vmovapd %ymm1, 32(%r10)
+	vmovapd %ymm2, 64(%r10)
+	vmovapd %ymm3, 96(%r10)
+
+	vmovapd %ymm4, 128(%r10)
+	vmovapd %ymm5, 160(%r10)
+	vmovapd %ymm6, 192(%r10)
+	vmovapd %ymm7, 224(%r10)
+
+	vmovapd %ymm8,  128(%r10, %r11, 1)
+	vmovapd %ymm9,  160(%r10, %r11, 1)
+	vmovapd %ymm10, 192(%r10, %r11, 1)
+	vmovapd %ymm11, 224(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x8u_lib4, .-inner_store_8x8u_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X8L_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x8l_vs_lib4, @function
+inner_store_8x8l_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x8l_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x8l_vs_lib4; .scl 2; .type 32; .endef
+inner_store_8x8l_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC03(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
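+	// mask note (editor's; assumes the constant .LC03, defined elsewhere in this file,
+	// holds {4.5, 5.5, 6.5, 7.5}): each lane of ymm15 is now .LC03[i]-km, and the
+	// vmaskmovpd stores below write a lane only when its sign bit is set, i.e. only rows
+	// 4 <= r < km of the second panel; the first panel (rows 0..3) is stored unconditionally.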
+
+	vmovapd %ymm0,  0(%r10)
+	vmovapd %ymm1, 32(%r10)
+	vmovapd %ymm2, 64(%r10)
+	vmovapd %ymm3, 96(%r10)
+
+	vmaskmovpd	%ymm4, %ymm15,  0(%r10, %r11, 1)
+	vmaskmovpd	%ymm5, %ymm15, 32(%r10, %r11, 1)
+	vmaskmovpd	%ymm6, %ymm15, 64(%r10, %r11, 1)
+	vmaskmovpd	%ymm7, %ymm15, 96(%r10, %r11, 1)
+
+	vmaskmovpd	%ymm8, %ymm15, 128(%r10, %r11, 1)
+	cmpl		$6, %r13d
+	jl			0f // end
+	vmaskmovpd	%ymm9, %ymm15, 160(%r10, %r11, 1)
+	cmpl		$7, %r13d
+	jl			0f // end
+	vmaskmovpd	%ymm10, %ymm15, 192(%r10, %r11, 1)
+	je			0f // end
+	vmaskmovpd	%ymm11, %ymm15, 224(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x8l_vs_lib4, .-inner_store_8x8l_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X8U_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x8u_vs_lib4, @function
+inner_store_8x8u_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x8u_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x8u_vs_lib4; .scl 2; .type 32; .endef
+inner_store_8x8u_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC03(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	vmovapd %ymm0,  0(%r10)
+	vmovapd %ymm1, 32(%r10)
+	vmovapd %ymm2, 64(%r10)
+	vmovapd %ymm3, 96(%r10)
+
+
+	vmovapd		%ymm4, 128(%r10)
+	vmaskmovpd	%ymm8, %ymm15, 128(%r10, %r11, 1)
+	cmpl		$6, %r13d
+	jl			0f // end
+	vmovapd		%ymm5, 160(%r10)
+	vmaskmovpd	%ymm9, %ymm15, 160(%r10, %r11, 1)
+	cmpl		$7, %r13d
+	jl			0f // end
+	vmovapd		%ymm6, 192(%r10)
+	vmaskmovpd	%ymm10, %ymm15, 192(%r10, %r11, 1)
+	je			0f // end
+	vmovapd		%ymm7, 224(%r10)
+	vmaskmovpd	%ymm11, %ymm15, 224(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x8u_vs_lib4, .-inner_store_8x8u_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x8_lib4, @function
+inner_store_l_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x8_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x8_lib4:
+#endif
+#endif
+	
+	vmovapd		%ymm0, 0(%r10)
+	vmovapd		32(%r10), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm1, %ymm1	
+	vmovapd		%ymm1, 32(%r10)
+	vmovapd		64(%r10), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm2, %ymm2	
+	vmovapd		%ymm2, 64(%r10)
+	vmovapd		96(%r10), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm3, %ymm3	
+	vmovapd		%ymm3, 96(%r10)
+
+	vmovapd		%ymm4, 0(%r10, %r11, 1)
+	vmovapd		%ymm5, 32(%r10, %r11, 1)
+	vmovapd		%ymm6, 64(%r10, %r11, 1)
+	vmovapd		%ymm7, 96(%r10, %r11, 1)
+
+	vmovapd		%ymm8, 128(%r10, %r11, 1)
+	vmovapd		160(%r10, %r11, 1), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm9, %ymm9
+	vmovapd		%ymm9, 160(%r10, %r11, 1)
+	vmovapd		192(%r10, %r11, 1), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm10, %ymm10
+	vmovapd		%ymm10, 192(%r10, %r11, 1)
+	vmovapd		224(%r10, %r11, 1), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm11, %ymm11
+	vmovapd		%ymm11, 224(%r10, %r11, 1)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_lib4, .-inner_store_l_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11   <- 4*sdd*sizeof(double)
+// r12d  <- km
+// r13d  <- kn
+// r14   <- dirty
+// r15   <- dirty
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm4  <- [d40 d50 d60 d70]
+// ymm5  <- [d41 d51 d61 d71]
+// ymm6  <- [d42 d52 d62 d72]
+// ymm7  <- [d43 d53 d63 d73]
+// ymm8  <- [d80 d90 da0 db0]
+// ymm9  <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X8_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x8_vs_lib4, @function
+inner_store_l_8x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x8_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x8_vs_lib4:
+#endif
+#endif
+	
+	vcvtsi2sd	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovupd		LC03(%rip), %ymm14
+#endif
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm15, %ymm14, %ymm15
+
+	vmovapd		%ymm0, 0(%r10)
+	vmovapd		32(%r10), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm1, %ymm1	
+	vmovapd		%ymm1, 32(%r10)
+	vmovapd		64(%r10), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm2, %ymm2	
+	vmovapd		%ymm2, 64(%r10)
+	vmovapd		96(%r10), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm3, %ymm3	
+	vmovapd		%ymm3, 96(%r10)
+
+	vmaskmovpd	%ymm4, %ymm15,  0(%r10, %r11, 1)
+	vmaskmovpd	%ymm5, %ymm15, 32(%r10, %r11, 1)
+	vmaskmovpd	%ymm6, %ymm15, 64(%r10, %r11, 1)
+	vmaskmovpd	%ymm7, %ymm15, 96(%r10, %r11, 1)
+
+	vmaskmovpd	%ymm8, %ymm15, 128(%r10, %r11, 1)
+	cmpl		$6, %r13d
+	jl			0f // end
+	vmovapd		160(%r10, %r11, 1), %ymm14
+	vblendpd	$0x1, %ymm14, %ymm9, %ymm9
+	vmaskmovpd	%ymm9, %ymm15, 160(%r10, %r11, 1)
+	cmpl		$7, %r13d
+	jl			0f // end
+	vmovapd		192(%r10, %r11, 1), %ymm14
+	vblendpd	$0x3, %ymm14, %ymm10, %ymm10
+	vmaskmovpd	%ymm10, %ymm15, 192(%r10, %r11, 1)
+	je			0f // end
+	vmovapd		224(%r10, %r11, 1), %ymm14
+	vblendpd	$0x7, %ymm14, %ymm11, %ymm11
+	vmaskmovpd	%ymm11, %ymm15, 224(%r10, %r11, 1)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_vs_lib4, .-inner_store_l_8x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// rbp  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// rbp  <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X8_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x8_gen_lib4, @function
+inner_store_8x8_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x8_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x8_gen_lib4; .scl 2; .type 32; .endef
+inner_store_8x8_gen_lib4:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2sd	%r13d, %xmm14, %xmm14
+	vcvtsi2sd	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm12
+	vmovupd		.LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm12
+	vmovupd		LC03(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vmovddup	%xmm15, %xmm15
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$1, %xmm15, %ymm15, %ymm15
+	vsubpd		%ymm12, %ymm14, %ymm14
+	vsubpd		%ymm15, %ymm13, %ymm15
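+	// mask note (editor's; assumes .LC02 = {0.5, 1.5, 2.5, 3.5} and .LC03 = {4.5, 5.5, 6.5, 7.5},
+	// both defined elsewhere in this file): ymm14 = m0 - .LC02 has its sign bit set only in
+	// lanes with row index >= m0 (first panel), and ymm15 = .LC03 - m1 only in lanes with
+	// row index < m1 (second panel), so in the offset==0 path the vmaskmovpd stores write
+	// only rows r with m0 <= r in the first panel and r < m1 in the second panel.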
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm6, %ymm5
+	vmovapd		%ymm3, %ymm2
+	vmovapd		%ymm7, %ymm6
+	vmovapd		%ymm8, %ymm7
+	vmovapd		%ymm9, %ymm8
+	vmovapd		%ymm10, %ymm9
+	vmovapd		%ymm11, %ymm10
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	vmovapd		%ymm2, %ymm1
+	vmovapd		%ymm6, %ymm5
+	vmovapd		%ymm7, %ymm6
+	vmovapd		%ymm8, %ymm7
+	vmovapd		%ymm9, %ymm8
+	vmovapd		%ymm10, %ymm9
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovapd		%ymm1, %ymm0
+	vmovapd		%ymm5, %ymm4
+	vmovapd		%ymm6, %ymm5
+	vmovapd		%ymm7, %ymm6
+	vmovapd		%ymm8, %ymm7
+	vmovapd		%ymm9, %ymm8
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$8, %eax
+	jle		0f
+	movl	$8, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	vmaskmovpd	%ymm0, %ymm14,  0(%r11)
+	vmaskmovpd	%ymm1, %ymm14, 32(%r11)
+	vmaskmovpd	%ymm2, %ymm14, 64(%r11)
+	vmaskmovpd	%ymm3, %ymm14, 96(%r11)
+
+	vmaskmovpd	%ymm4, %ymm15,  0(%r11, %r12, 1)
+	vmaskmovpd	%ymm5, %ymm15, 32(%r11, %r12, 1)
+	vmaskmovpd	%ymm6, %ymm15, 64(%r11, %r12, 1)
+	vmaskmovpd	%ymm7, %ymm15, 96(%r11, %r12, 1)
+
+	vmaskmovpd	%ymm8, %ymm15, 128(%r11, %r12, 1)
+	cmpl		$6, %r15d
+	jl			4f // end
+	vmaskmovpd	%ymm9, %ymm15, 160(%r11, %r12, 1)
+	cmpl		$7, %r15d
+	jl			4f // end
+	vmaskmovpd	%ymm10, %ymm15, 192(%r11, %r12, 1)
+	je			4f // end
+	vmaskmovpd	%ymm11, %ymm15, 224(%r11, %r12, 1)
+
+	jmp		4f
+
+0:
+	
+	cmpl	$1, %r10d
+	jg		1f
+
+	// offset==1
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x03, %ymm4, %ymm0, %ymm12
+	vshufpd		$0x5, %ymm0, %ymm12, %ymm0
+	vperm2f128	$0x03, %ymm13, %ymm4, %ymm12
+	vshufpd		$0x5, %ymm4, %ymm12, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x03, %ymm5, %ymm1, %ymm12
+	vshufpd		$0x5, %ymm1, %ymm12, %ymm1
+	vperm2f128	$0x03, %ymm13, %ymm5, %ymm12
+	vshufpd		$0x5, %ymm5, %ymm12, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x03, %ymm6, %ymm2, %ymm12
+	vshufpd		$0x5, %ymm2, %ymm12, %ymm2
+	vperm2f128	$0x03, %ymm13, %ymm6, %ymm12
+	vshufpd		$0x5, %ymm6, %ymm12, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x03, %ymm7, %ymm3, %ymm12
+	vshufpd		$0x5, %ymm3, %ymm12, %ymm3
+	vperm2f128	$0x03, %ymm13, %ymm7, %ymm12
+	vshufpd		$0x5, %ymm7, %ymm12, %ymm7
+
+	vperm2f128	$0x01, %ymm8, %ymm8, %ymm12
+	vshufpd		$0x5, %ymm8, %ymm12, %ymm8
+
+	vperm2f128	$0x01, %ymm9, %ymm9, %ymm12
+	vshufpd		$0x5, %ymm9, %ymm12, %ymm9
+
+	vperm2f128	$0x01, %ymm10, %ymm10, %ymm12
+	vshufpd		$0x5, %ymm10, %ymm12, %ymm10
+
+	vperm2f128	$0x01, %ymm11, %ymm11, %ymm12
+	vshufpd		$0x5, %ymm11, %ymm12, %ymm11
+
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm15, %ymm12, %ymm15
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm12
+	vshufpd		$0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vandpd		.LC08(%rip), %ymm14, %ymm12
+	vandpd		.LC05(%rip), %ymm15, %ymm13
+#elif defined(OS_MAC)
+	vandpd		LC08(%rip), %ymm14, %ymm12
+	vandpd		LC05(%rip), %ymm15, %ymm13
+#endif
+
+	vblendpd	$0x1, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vandpd		.LC08(%rip), %ymm15, %ymm15
+#elif defined(OS_MAC)
+	vandpd		LC08(%rip), %ymm15, %ymm15
+#endif
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r10d
+	jg		2f
+
+	// offset==2
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x03, %ymm4, %ymm0, %ymm0
+	vperm2f128	$0x03, %ymm13, %ymm4, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x03, %ymm5, %ymm1, %ymm1
+	vperm2f128	$0x03, %ymm13, %ymm5, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x03, %ymm6, %ymm2, %ymm2
+	vperm2f128	$0x03, %ymm13, %ymm6, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x03, %ymm7, %ymm3, %ymm3
+	vperm2f128	$0x03, %ymm13, %ymm7, %ymm7
+
+	vperm2f128	$0x01, %ymm8, %ymm8, %ymm8
+
+	vperm2f128	$0x01, %ymm9, %ymm9, %ymm9
+
+	vperm2f128	$0x01, %ymm10, %ymm10, %ymm10
+
+	vperm2f128	$0x01, %ymm11, %ymm11, %ymm11
+
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm14
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vandpd		.LC09(%rip), %ymm14, %ymm12
+	vandpd		.LC06(%rip), %ymm15, %ymm13
+#elif defined(OS_MAC)
+	vandpd		LC09(%rip), %ymm14, %ymm12
+	vandpd		LC06(%rip), %ymm15, %ymm13
+#endif
+
+	vblendpd	$0x3, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vandpd		.LC09(%rip), %ymm15, %ymm15
+#elif defined(OS_MAC)
+	vandpd		LC09(%rip), %ymm15, %ymm15
+#endif
+
+	jmp		3f
+
+2:
+
+	// offset==3
+
+	vmovapd		%ymm0, %ymm13
+	vperm2f128	$0x21, %ymm0, %ymm4, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm4, %ymm0
+	vperm2f128	$0x21, %ymm4, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm4
+
+	vmovapd		%ymm1, %ymm13
+	vperm2f128	$0x21, %ymm1, %ymm5, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm5, %ymm1
+	vperm2f128	$0x21, %ymm5, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm5
+
+	vmovapd		%ymm2, %ymm13
+	vperm2f128	$0x21, %ymm2, %ymm6, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm6, %ymm2
+	vperm2f128	$0x21, %ymm6, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm6
+
+	vmovapd		%ymm3, %ymm13
+	vperm2f128	$0x21, %ymm3, %ymm7, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm7, %ymm3
+	vperm2f128	$0x21, %ymm7, %ymm13, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm13, %ymm7
+
+	vperm2f128	$0x01, %ymm8, %ymm8, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm8, %ymm8
+
+	vperm2f128	$0x01, %ymm9, %ymm9, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm9, %ymm9
+
+	vperm2f128	$0x01, %ymm10, %ymm10, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm10, %ymm10
+
+	vperm2f128	$0x01, %ymm11, %ymm11, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm11, %ymm11
+
+	vperm2f128	$0x01, %ymm14, %ymm14, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm14, %ymm14
+	vperm2f128	$0x01, %ymm15, %ymm15, %ymm12
+	vshufpd		$0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vandpd		.LC10(%rip), %ymm14, %ymm12
+	vandpd		.LC07(%rip), %ymm15, %ymm13
+#elif defined(OS_MAC)
+	vandpd		LC10(%rip), %ymm14, %ymm12
+	vandpd		LC07(%rip), %ymm15, %ymm13
+#endif
+
+	vblendpd	$0x7, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vandpd		.LC10(%rip), %ymm15, %ymm15
+#elif defined(OS_MAC)
+	vandpd		LC10(%rip), %ymm15, %ymm15
+#endif
+
+3:
+
+	vmaskmovpd	%ymm0, %ymm12, 0(%r11)
+	vmaskmovpd	%ymm4, %ymm14, 0(%r11, %r12, 1)
+	vmaskmovpd	%ymm0, %ymm13, 0(%r11, %r12, 2)
+	vmaskmovpd	%ymm1, %ymm12, 32(%r11)
+	vmaskmovpd	%ymm5, %ymm14, 32(%r11, %r12, 1)
+	vmaskmovpd	%ymm1, %ymm13, 32(%r11, %r12, 2)
+	vmaskmovpd	%ymm2, %ymm12, 64(%r11)
+	vmaskmovpd	%ymm6, %ymm14, 64(%r11, %r12, 1)
+	vmaskmovpd	%ymm2, %ymm13, 64(%r11, %r12, 2)
+	vmaskmovpd	%ymm3, %ymm12, 96(%r11)
+	vmaskmovpd	%ymm7, %ymm14, 96(%r11, %r12, 1)
+	vmaskmovpd	%ymm3, %ymm13, 96(%r11, %r12, 2)
+
+	vmaskmovpd	%ymm8, %ymm15, 128(%r11, %r12, 1)
+	vmaskmovpd	%ymm8, %ymm13, 128(%r11, %r12, 2)
+	cmpl		$6, %r15d
+	jl			4f // end
+	vmaskmovpd	%ymm9, %ymm15, 160(%r11, %r12, 1)
+	vmaskmovpd	%ymm9, %ymm13, 160(%r11, %r12, 2)
+	cmpl		$7, %r15d
+	jl			4f // end
+	vmaskmovpd	%ymm10, %ymm15, 192(%r11, %r12, 1)
+	vmaskmovpd	%ymm10, %ymm13, 192(%r11, %r12, 2)
+	je			4f // end
+	vmaskmovpd	%ymm11, %ymm15, 224(%r11, %r12, 1)
+	vmaskmovpd	%ymm11, %ymm13, 224(%r11, %r12, 2)
+
+4:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x8_gen_lib4, .-inner_store_8x8_gen_lib4
+#endif
+#endif
+
+
+
+
+
+//                               1      2              3          4        5          6        7             8          9        10         11
+// void kernel_dgemm_nt_8x8l_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
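+// usage sketch (editor's note, not part of the original source; semantics inferred from the
+// kernel and inner-routine names): for panel-major (lib4) matrices with panel strides
+// sda/sdb/sdc/sdd, the kernel computes the 8x8 block D = beta*C + alpha*A*B^T and stores only
+// its lower part (the upper-right 4x4 panel of D is left untouched by inner_store_8x8l_lib4),
+// e.g.
+//   double alpha = 1.0, beta = 0.0;
+//   kernel_dgemm_nt_8x8l_lib4(k, &alpha, A, sda, B, sdb, &beta, C, sdc, D, sdd);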
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_8x8l_lib4
+	.type kernel_dgemm_nt_8x8l_lib4, @function
+kernel_dgemm_nt_8x8l_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_8x8l_lib4
+_kernel_dgemm_nt_8x8l_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_8x8l_lib4
+	.def kernel_dgemm_nt_8x8l_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x8l_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+	movq	ARG6, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // C
+	movq	ARG9, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG10, %r10 // D
+	movq	ARG11, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8L_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8l_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x8l_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_8x8l_lib4, .-kernel_dgemm_nt_8x8l_lib4
+#endif
+
+
+
+
+
+//                               1      2              3          4        5          6        7             8          9        10         11
+// void kernel_dgemm_nt_8x8u_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_8x8u_lib4
+	.type kernel_dgemm_nt_8x8u_lib4, @function
+kernel_dgemm_nt_8x8u_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_8x8u_lib4
+_kernel_dgemm_nt_8x8u_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_8x8u_lib4
+	.def kernel_dgemm_nt_8x8u_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x8u_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG5, %r11 // B
+	movq	ARG6, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG3, %r13 // A
+	movq	ARG4, %r14 // sda
+	sall	$5, %r14d // 4*sda*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // C
+	movq	ARG9, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG10, %r10 // D
+	movq	ARG11, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8U_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8u_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x8u_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_8x8u_lib4, .-kernel_dgemm_nt_8x8u_lib4
+#endif
+
+
+
+
+
+//                                   1      2              3          4        5          6        7             8          9        10         11       12      13
+// void kernel_dgemm_nt_8x8l_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
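+// note (editor's): relative to kernel_dgemm_nt_8x8l_lib4 above, the extra km/kn arguments of
+// this "variable size" kernel give the number of rows and columns actually written, so it can
+// handle boundary blocks smaller than 8x8 via the masked stores in inner_store_8x8l_vs_lib4.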
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_8x8l_vs_lib4
+	.type kernel_dgemm_nt_8x8l_vs_lib4, @function
+kernel_dgemm_nt_8x8l_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_8x8l_vs_lib4
+_kernel_dgemm_nt_8x8l_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_8x8l_vs_lib4
+	.def kernel_dgemm_nt_8x8l_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x8l_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+	movq	ARG6, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // C
+	movq	ARG9, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG10, %r10 // D
+	movq	ARG11, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG12, %r12 // km 
+	movq	ARG13, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8L_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8l_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x8l_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_8x8l_vs_lib4, .-kernel_dgemm_nt_8x8l_vs_lib4
+#endif
+
+
+
+
+
+//                                   1      2              3          4        5          6        7             8          9        10         11       12      13
+// void kernel_dgemm_nt_8x8u_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_8x8u_vs_lib4
+	.type kernel_dgemm_nt_8x8u_vs_lib4, @function
+kernel_dgemm_nt_8x8u_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_8x8u_vs_lib4
+_kernel_dgemm_nt_8x8u_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_8x8u_vs_lib4
+	.def kernel_dgemm_nt_8x8u_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x8u_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG5, %r11 // B
+	movq	ARG6, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG3, %r13 // A
+	movq	ARG4, %r14 // sda
+	sall	$5, %r14d // 4*sda*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // C
+	movq	ARG9, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG10, %r10 // D
+	movq	ARG11, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG12, %r12 // km 
+	movq	ARG13, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8U_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8u_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x8u_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_8x8u_vs_lib4, .-kernel_dgemm_nt_8x8u_vs_lib4
+#endif
+
+
+
+
+
+#if 0
+//                                   1      2              3          4        5          6        7             8         9          10       11        12         13       14      15      16      17
+// void kernel_dgemm_nt_8x8_gen_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_8x8_gen_lib4
+	.type kernel_dgemm_nt_8x8_gen_lib4, @function
+kernel_dgemm_nt_8x8_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_8x8_gen_lib4
+_kernel_dgemm_nt_8x8_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_8x8_gen_lib4
+	.def kernel_dgemm_nt_8x8_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x8_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+	movq	ARG6, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // C
+	movq	ARG9, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG10, %r10 // D
+	movq	ARG11, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_8x8_gen_lib4, .-kernel_dgemm_nt_8x8_gen_lib4
+#endif
+#endif
+
+
+
+
+
+//                               1      2              3          4        5          6        7             8          9        10         11
+// void kernel_dsyrk_nt_l_8x8_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
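+// note (editor's): the accumulation is the same alpha*A*B^T + beta*C as in the dgemm kernels
+// above, but the result goes through inner_store_l_8x8_lib4, which writes only the lower
+// triangle (diagonal included) of the 8x8 block and preserves the strictly upper part of D
+// through its vblendpd with the values reloaded from D.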
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_8x8_lib4
+	.type kernel_dsyrk_nt_l_8x8_lib4, @function
+kernel_dsyrk_nt_l_8x8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_8x8_lib4
+_kernel_dsyrk_nt_l_8x8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_8x8_lib4
+	.def kernel_dsyrk_nt_l_8x8_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+	movq	ARG6, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // C
+	movq	ARG9, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG10, %r10 // D
+	movq	ARG11, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_8x8_lib4, .-kernel_dsyrk_nt_l_8x8_lib4
+#endif
+
+
+
+
+
+
+//                                  1      2              3          4        5          6        7             8          9        10         11       12      13
+// void kernel_dsyrk_nt_l_8x8_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_8x8_vs_lib4
+	.type kernel_dsyrk_nt_l_8x8_vs_lib4, @function
+kernel_dsyrk_nt_l_8x8_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_8x8_vs_lib4
+_kernel_dsyrk_nt_l_8x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_8x8_vs_lib4
+	.def kernel_dsyrk_nt_l_8x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x8_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+	movq	ARG6, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // C
+	movq	ARG9, %r13 // sdc
+	sall	$5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG10, %r10 // D
+	movq	ARG11, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG12, %r12 // km
+	movq	ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_8x8_vs_lib4, .-kernel_dsyrk_nt_l_8x8_vs_lib4
+#endif
+
+
+
+
+
+
+//                                  1      2          3        4          5        6          7        8          9        10
+// void kernel_dpotrf_nt_l_8x8_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
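+// note (editor's sketch, inferred from the inner routines called below): the kernel computes
+// C - A*B^T, factorizes the resulting 8x8 block as L*L^T (lower Cholesky) via
+// inner_edge_dpotrf_8x8_vs_lib4, stores the lower factor into D and writes the reciprocals of
+// its diagonal entries into inv_diag_D (the form consumed by the dtrsm edge routines in this
+// file).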
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_8x8_lib4
+	.type kernel_dpotrf_nt_l_8x8_lib4, @function
+kernel_dpotrf_nt_l_8x8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_8x8_lib4
+_kernel_dpotrf_nt_l_8x8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_8x8_lib4
+	.def kernel_dpotrf_nt_l_8x8_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG10, %r10  // inv_diag_D 
+	movl	$8, %r11d
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_8x8_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_8x8_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // store address D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_8x8_lib4, .-kernel_dpotrf_nt_l_8x8_lib4
+#endif
+
+
+
+
+
+//                                     1      2          3        4          5        6          7        8          9        10                  11      12
+// void kernel_dpotrf_nt_l_8x8_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_8x8_vs_lib4
+	.type kernel_dpotrf_nt_l_8x8_vs_lib4, @function
+kernel_dpotrf_nt_l_8x8_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_8x8_vs_lib4
+_kernel_dpotrf_nt_l_8x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_8x8_vs_lib4
+	.def kernel_dpotrf_nt_l_8x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x8_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG10, %r10  // inv_diag_D 
+	movq	ARG12, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_8x8_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_8x8_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // store address D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG11, %r12 // km 
+	movq	ARG12, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_8x8_vs_lib4, .-kernel_dpotrf_nt_l_8x8_vs_lib4
+#endif
+
+
+
+
+
+//                                        1       2           3         4           5         6       7           8         9           10        11         12       13         14       15
+// void kernel_dsyrk_dpotrf_nt_l_8x8_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
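+// note (editor's sketch): fused update and factorization; the "p" operands are accumulated
+// with inner_kernel_dgemm_add_nt_8x8_lib4 and the "m" operands subtracted with
+// inner_kernel_dgemm_sub_nt_8x8_lib4, so the factorized block is roughly
+// chol_lower(C + Ap*Bp^T - Am*Bm^T), with inv_diag_D again receiving the reciprocal diagonal.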
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_8x8_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_8x8_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_8x8_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_8x8_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_8x8_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11 // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d // 4*sdap*sizeof(double)
+	movq	ARG4, %r13 // Bp
+	movq	ARG5, %r14 // sdbp
+	sall	$5, %r14d // 4*sdbp*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+	movq	ARG6, %r10 // km
+	movq	ARG7, %r11 // Am
+	movq	ARG8, %r12 // sdam
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+	movq	ARG9, %r13 // Bm
+	movq	ARG10, %r14 // sdbm
+	sall	$5, %r14d // 4*sdbm*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG11, %r10 // C
+	movq	ARG12, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG15, %r10  // inv_diag_D 
+	movl	$8, %r11d
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_8x8_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_8x8_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG13, %r10 // store address D
+	movq	ARG14, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_8x8_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x8_lib4
+#endif
+
+
+
+
+
+//                                           1       2           3         4           5         6       7           8         9           10        11         12       13         14       15                  16      17
+// void kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11 // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d // 4*sdap*sizeof(double)
+	movq	ARG4, %r13 // Bp
+	movq	ARG5, %r14 // sdbp
+	sall	$5, %r14d // 4*sdbp*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG6, %r10 // km
+	movq	ARG7, %r11 // Am
+	movq	ARG8, %r12 // sdam
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+	movq	ARG9, %r13 // Bm
+	movq	ARG10, %r14 // sdbm
+	sall	$5, %r14d // 4*sdbm*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG11, %r10 // C
+	movq	ARG12, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG15, %r10  // inv_diag_D 
+	movq	ARG17, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_8x8_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_8x8_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG13, %r10 // store address D
+	movq	ARG14, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG16, %r12 // km 
+	movq	ARG17, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4
+#endif
+
+
+
+
+
+//                                       1      2          3        4          5        6          7        8          9        10         11       12
+// void kernel_dtrsm_nt_rl_inv_8x8l_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
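+// note (editor's sketch, semantics inferred from the routine names): computes the 8x8 block
+// D = (C - A*B^T) * E^-T with E lower triangular and applied transposed (cf. the
+// inner_edge_dtrsm_rlt_inv_* routines above), using inv_diag_E as the precomputed reciprocal
+// diagonal of E, and stores the lower part of the result via inner_store_8x8l_lib4.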
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_8x8l_lib4
+	.type kernel_dtrsm_nt_rl_inv_8x8l_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x8l_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_8x8l_lib4
+_kernel_dtrsm_nt_rl_inv_8x8l_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_8x8l_lib4
+	.def kernel_dtrsm_nt_rl_inv_8x8l_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x8l_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13
+	movq	ARG5, %r14
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG6, %r10
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11  // sde 
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG12, %r12  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X8L_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x8l_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x8l_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // store address D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8L_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8l_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x8l_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_8x8l_lib4, .-kernel_dtrsm_nt_rl_inv_8x8l_lib4
+#endif
+
+
+
+
+
+//                                       1      2          3        4          5        6          7        8          9        10         11       12
+// void kernel_dtrsm_nt_rl_inv_8x8u_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_8x8u_lib4
+	.type kernel_dtrsm_nt_rl_inv_8x8u_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x8u_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_8x8u_lib4
+_kernel_dtrsm_nt_rl_inv_8x8u_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_8x8u_lib4
+	.def kernel_dtrsm_nt_rl_inv_8x8u_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x8u_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG4, %r11
+	movq	ARG5, %r12
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG2, %r13
+	movq	ARG3, %r14
+	sall	$5, %r14d // 4*sda*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG6, %r10
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_11_8x8_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11  // sde 
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG12, %r12  // inv_diag_E 
+	movq	ARG8, %r13 // D
+	movq	ARG9, %r14 // sdd
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X8U_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x8u_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x8u_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // store address D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8U_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8u_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x8u_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_8x8u_lib4, .-kernel_dtrsm_nt_rl_inv_8x8u_lib4
+#endif
+
+
+
+
+
+//                                          1      2          3        4          5        6          7        8          9        10         11       12                  13      14
+// void kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4
+	.type kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4
+_kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4
+	.def kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG4, %r13
+	movq	ARG5, %r14
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG6, %r10
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11  // sde 
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG12, %r12  // inv_diag_E 
+	movq	ARG14, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X8L_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // store address D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG13, %r12 // km 
+	movq	ARG14, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8L_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8l_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x8l_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4, .-kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4
+#endif
+
+
+
+
+
+//                                          1      2          3        4          5        6          7        8          9        10         11       12                  13      14
+// void kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4
+	.type kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4
+_kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4
+	.def kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG4, %r11
+	movq	ARG5, %r12
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG2, %r13
+	movq	ARG3, %r14
+	sall	$5, %r14d // 4*sda*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG6, %r10 // C
+	movq	ARG7, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_11_8x8_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG10, %r10  // E 
+	movq	ARG11, %r11  // sde 
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG12, %r12  // inv_diag_E 
+	movq	ARG8, %r13 // D
+	movq	ARG9, %r14 // sdd
+	sall	$5, %r14d // 4*sdd*sizeof(double)
+	movq	ARG14, %r15 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X8U_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // store address D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG13, %r12 // km 
+	movq	ARG14, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8U_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8u_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x8u_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4, .-kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4
+#endif
+
+
+
+
+
+//                                                1       2           3         4           5         6       7           8          9          10        11         12       13         14       15         16       17                  18      19
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sdap*sizeof(double)
+	movq	ARG4, %r13
+	movq	ARG5, %r14
+	sall	$5, %r14d // 4*sdbp*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+	movq	ARG6, %r10
+	movq	ARG7, %r11
+	movq	ARG8, %r12
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+	movq	ARG9, %r13
+	movq	ARG10, %r14
+	sall	$5, %r14d // 4*sdbm*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG11, %r10
+	movq	ARG12, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG15, %r10  // E 
+	movq	ARG16, %r11  // sde 
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG17, %r12  // inv_diag_E 
+	movq	ARG19, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X8L_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG13, %r10 // store address D
+	movq	ARG14, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG18, %r12 // km 
+	movq	ARG19, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8L_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8l_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x8l_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4
+#endif
+
+
+
+
+
+//                                                1       2           3         4           5         6       7           8          9          10        11         12       13         14       15         16       17                  18      19
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovapd	%ymm0, %ymm8
+	vmovapd	%ymm0, %ymm9
+	vmovapd	%ymm0, %ymm10
+	vmovapd	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG4, %r11
+	movq	ARG5, %r12
+	sall	$5, %r12d // 4*sdbp*sizeof(double)
+	movq	ARG2, %r13
+	movq	ARG3, %r14
+	sall	$5, %r14d // 4*sdap*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+	movq	ARG6, %r10
+	movq	ARG9, %r11
+	movq	ARG10, %r12
+	sall	$5, %r12d // 4*sdbm*sizeof(double)
+	movq	ARG7, %r13
+	movq	ARG8, %r14
+	sall	$5, %r14d // 4*sdam*sizeof(double)
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG11, %r10 // C
+	movq	ARG12, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_11_8x8_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG15, %r10  // E 
+	movq	ARG16, %r11  // sde 
+	sall	$5, %r11d // 4*sde*sizeof(double)
+	movq	ARG17, %r12  // inv_diag_E 
+	movq	ARG13, %r13 // D
+	movq	ARG14, %r14 // sdd
+	sall	$5, %r14d // 4*sdd*sizeof(double)
+	movq	ARG19, %r15 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_8X8U_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG13, %r10 // store address D
+	movq	ARG14, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG18, %r12 // km 
+	movq	ARG19, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8U_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8u_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8x8u_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+	.long	0
+	.long	-1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	-1074790400
+
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgemv_8_lib4.S b/kernel/avx2/kernel_dgemv_8_lib4.S
new file mode 100644
index 0000000..1c9185a
--- /dev/null
+++ b/kernel/avx2/kernel_dgemv_8_lib4.S
@@ -0,0 +1,1543 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
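+
+// Note on the ARGn macros above (an explanatory sketch, not part of the code
+// paths): on System V (OS_LINUX / OS_MAC) the first six integer/pointer
+// arguments arrive in rdi, rsi, rdx, rcx, r8, r9, and the 7th argument onwards
+// sit at 8(%rsp), 16(%rsp), ... on entry; after the PROLOGUE has executed
+// subq $STACKSIZE, %rsp they are therefore found at STACKSIZE+8(%rsp),
+// STACKSIZE+16(%rsp), and so on. On Windows x64 only the first four arguments
+// are passed in rcx, rdx, r8, r9 and the caller reserves a 32-byte shadow
+// space, which is why ARG5 starts at STACKSIZE+40(%rsp) there.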
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- x
+// r15   <- dirty
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z0 z1 z2 z3]_b
+// ymm3  <- [z4 z5 z6 z7]_b
+// ymm4  <- [z0 z1 z2 z3]_c
+// ymm5  <- [z4 z5 z6 z7]_c
+// ymm6  <- [z0 z1 z2 z3]_d
+// ymm7  <- [z4 z5 z6 z7]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- x+k*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z0 z1 z2 z3]_b
+// ymm3  <- [z4 z5 z6 z7]_b
+// ymm4  <- [z0 z1 z2 z3]_c
+// ymm5  <- [z4 z5 z6 z7]_c
+// ymm6  <- [z0 z1 z2 z3]_d
+// ymm7  <- [z4 z5 z6 z7]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_n_8_lib4, @function
+inner_kernel_dgemv_add_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_n_8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_8_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+	cmpl	$4, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vbroadcastsd	0(%r13), %ymm12
+	vmovapd			0(%r11), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vmovapd			0(%r15), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	
+	subl	$4, %r10d
+
+	vbroadcastsd	8(%r13), %ymm12
+	vmovapd			32(%r11), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vmovapd			32(%r15), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	
+	vbroadcastsd	16(%r13), %ymm12
+	vmovapd			64(%r11), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm4
+	vmovapd			64(%r15), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm5
+
+	vbroadcastsd	24(%r13), %ymm12
+	addq			$32, %r13
+	vmovapd			96(%r11), %ymm8
+	addq			$128, %r11
+	vfmadd231pd		%ymm8, %ymm12, %ymm6
+	vmovapd			96(%r15), %ymm8
+	addq			$128, %r15
+	vfmadd231pd		%ymm8, %ymm12, %ymm7
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vbroadcastsd	0(%r13), %ymm12
+	vmovapd			0(%r11), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm0, %ymm15, %ymm0
+	vmovapd			0(%r15), %ymm8
+	vmulpd			%ymm8, %ymm12, %ymm15
+	vaddpd			%ymm1, %ymm15, %ymm1
+	
+	addq	$32, %r11
+	addq	$32, %r15
+	addq	$8, %r13
+	
+	subl	$1, %r10d
+	cmpl	$0, %r10d
+
+	jg		0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_n_8_lib4, .-inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm4  <- [z4a z4b z4c z4d]
+// ymm5  <- [z5a z5b z5c z5d]
+// ymm6  <- [z6a z6b z6c z6d]
+// ymm7  <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x+k*sizeof(double)
+// ymm0  <- [z0a z0b z0c z0d]
+// ymm1  <- [z1a z1b z1c z1d]
+// ymm2  <- [z2a z2b z2c z2d]
+// ymm3  <- [z3a z3b z3c z3d]
+// ymm4  <- [z4a z4b z4c z4d]
+// ymm5  <- [z5a z5b z5c z5d]
+// ymm6  <- [z6a z6b z6c z6d]
+// ymm7  <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_T_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_t_8_lib4, @function
+inner_kernel_dgemv_add_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_t_8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_8_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovupd	0(%r13), %ymm12
+
+	vmovapd	0(%r11), %ymm8
+	vfmadd231pd	%ymm8, %ymm12, %ymm0
+	
+	subl	$4, %r10d
+
+	vmovapd	32(%r11), %ymm8
+	vfmadd231pd	%ymm8, %ymm12, %ymm1
+	
+	vmovapd	64(%r11), %ymm8
+	vfmadd231pd	%ymm8, %ymm12, %ymm2
+
+	vmovapd	96(%r11), %ymm8
+	vfmadd231pd	%ymm8, %ymm12, %ymm3
+
+	vmovapd	128(%r11), %ymm8
+	vfmadd231pd	%ymm8, %ymm12, %ymm4
+	
+	vmovapd	160(%r11), %ymm8
+	vfmadd231pd	%ymm8, %ymm12, %ymm5
+	
+	vmovapd	192(%r11), %ymm8
+	vfmadd231pd	%ymm8, %ymm12, %ymm6
+
+	vmovapd	224(%r11), %ymm8
+	vfmadd231pd	%ymm8, %ymm12, %ymm7
+	
+	addq	%r12, %r11
+	addq	$32, %r13
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
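+	// (descriptive note) tail masking: the leftover count in r10d is converted
+	// to double, broadcast, and subtracted from .LC02 = { 3.5 2.5 1.5 0.5 };
+	// lane i of the result is negative exactly when i < k_left, so the
+	// following vmaskmovpd loads only the k_left valid entries of x and leaves
+	// the other lanes at zero.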
+	
+	vcvtsi2sd	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vsubpd		%ymm14, %ymm13, %ymm14
+
+	vmaskmovpd	0(%r13), %ymm14, %ymm12
+
+	vmovapd	0(%r11), %ymm8
+	vfmadd231pd	%ymm8, %ymm12, %ymm0
+	
+	vmovapd	32(%r11), %ymm8
+	vfmadd231pd	%ymm8, %ymm12, %ymm1
+	
+	vmovapd	64(%r11), %ymm8
+	vfmadd231pd	%ymm8, %ymm12, %ymm2
+
+	vmovapd	96(%r11), %ymm8
+	vfmadd231pd	%ymm8, %ymm12, %ymm3
+		
+	vmovapd	128(%r11), %ymm8
+	vfmadd231pd	%ymm8, %ymm12, %ymm4
+	
+	vmovapd	160(%r11), %ymm8
+	vfmadd231pd	%ymm8, %ymm12, %ymm5
+	
+	vmovapd	192(%r11), %ymm8
+	vfmadd231pd	%ymm8, %ymm12, %ymm6
+
+	vmovapd	224(%r11), %ymm8
+	vfmadd231pd	%ymm8, %ymm12, %ymm7
+
+	sall	$3, %r10d
+//	movslq	%r10d, %r10
+	addq	%r10, %r11
+	addq	%r10, %r13
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_t_8_lib4, .-inner_kernel_dgemv_add_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- x
+// r15   <- dirty
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z0 z1 z2 z3]_b
+// ymm3  <- [z4 z5 z6 z7]_b
+// ymm4  <- [z0 z1 z2 z3]_c
+// ymm5  <- [z4 z5 z6 z7]_c
+// ymm6  <- [z0 z1 z2 z3]_d
+// ymm7  <- [z4 z5 z6 z7]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- k-4
+// r11   <- A+4*4*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- x+4*sizeof(double)
+// r15   <- dirty
+// ymm0  <- [z0 z1 z2 z3]_a
+// ymm1  <- [z4 z5 z6 z7]_a
+// ymm2  <- [z0 z1 z2 z3]_b
+// ymm3  <- [z4 z5 z6 z7]_b
+// ymm4  <- [z0 z1 z2 z3]_c
+// ymm5  <- [z4 z5 z6 z7]_c
+// ymm6  <- [z0 z1 z2 z3]_d
+// ymm7  <- [z4 z5 z6 z7]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMV_UN_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmv_un_8_lib4, @function
+inner_edge_dtrmv_un_8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmv_un_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmv_un_8_lib4:
+#endif
+#endif
+	
+	movq	%r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+	vxorpd			%ymm14, %ymm14, %ymm14
+
+	// first 4 columns
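+	// (note) column j of the triangular block has nonzeros only in rows 0..j:
+	// vblendpd with masks 0x1, 0x3, 0x7 keeps the first 1, 2 or 3 entries of
+	// the loaded panel column and takes the remaining lanes from ymm14 (zeroed
+	// above); the fourth column needs no masking.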
+	vmovapd			0(%r11), %ymm8
+	vblendpd		$0x1, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	0(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	
+	subl			$4, %r10d
+
+	vmovapd			32(%r11), %ymm8
+	vblendpd		$0x3, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	8(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	
+	vmovapd			64(%r11), %ymm8
+	vblendpd		$0x7, %ymm8, %ymm14, %ymm8
+	vbroadcastsd	16(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm4
+
+	vmovapd			96(%r11), %ymm8
+	vbroadcastsd	24(%r13), %ymm12
+	vfmadd231pd		%ymm8, %ymm12, %ymm6
+	
+	addq			$128, %r11
+	addq			$128, %r15
+	addq			$32, %r13
+
+
+
+	// last 4 columns
+	vbroadcastsd	0(%r13), %ymm12
+	vmovapd			0(%r11), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm0
+	vmovapd			0(%r15), %ymm8
+	vblendpd		$0x1, %ymm8, %ymm14, %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm1
+	
+	subl			$4, %r10d
+
+	vbroadcastsd	8(%r13), %ymm12
+	vmovapd			32(%r11), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm2
+	vmovapd			32(%r15), %ymm8
+	vblendpd		$0x3, %ymm8, %ymm14, %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm3
+	
+	vbroadcastsd	16(%r13), %ymm12
+	vmovapd			64(%r11), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm4
+	vmovapd			64(%r15), %ymm8
+	vblendpd		$0x7, %ymm8, %ymm14, %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm5
+
+	vbroadcastsd	24(%r13), %ymm12
+	vmovapd			96(%r11), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm6
+	vmovapd			96(%r15), %ymm8
+	vfmadd231pd		%ymm8, %ymm12, %ymm7
+	
+	addq			$128, %r11
+	addq			$128, %r15
+	addq			$32, %r13
+	
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmv_un_8_lib4, .-inner_edge_dtrmv_un_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n
+//
+// input arguments:
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm4 <- [z0 z1 z2 z3]_c
+// ymm5 <- [z4 z5 z6 z7]_c
+// ymm6 <- [z0 z1 z2 z3]_d
+// ymm7 <- [z4 z5 z6 z7]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_8_lib4, @function
+inner_blend_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_8_lib4; .scl 2; .type 32; .endef
+inner_blend_n_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm2, %ymm0
+	vaddpd	%ymm1, %ymm3, %ymm1
+	vaddpd	%ymm4, %ymm6, %ymm4
+	vaddpd	%ymm5, %ymm7, %ymm5
+	vaddpd	%ymm0, %ymm4, %ymm0
+	vaddpd	%ymm1, %ymm5, %ymm1
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_8_lib4, .-inner_blend_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t
+//
+// input arguments:
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_8_lib4, @function
+inner_blend_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_8_lib4; .scl 2; .type 32; .endef
+inner_blend_t_8_lib4:
+#endif
+#endif
+
+	// reduction
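+	// (descriptive note) each vhaddpd sums adjacent pairs, leaving partial sums
+	// of two outputs interleaved across the 128-bit halves; the two vperm2f128
+	// shuffles regroup matching halves so that the final vaddpd produces
+	// [z0 z1 z2 z3] in ymm0 and [z4 z5 z6 z7] in ymm1.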
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm5, %ymm4, %ymm4
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vhaddpd	%ymm7, %ymm6, %ymm6
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm3
+	vperm2f128	$0x2, %ymm4, %ymm6, %ymm5
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vperm2f128	$0x13, %ymm4, %ymm6, %ymm4
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm4, %ymm5, %ymm1
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_8_lib4, .-inner_blend_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm4 <- [z0 z1 z2 z3]_c
+// ymm5 <- [z4 z5 z6 z7]_c
+// ymm6 <- [z0 z1 z2 z3]_d
+// ymm7 <- [z4 z5 z6 z7]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_N_SCALE_AB_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_n_scale_ab_8_lib4, @function
+inner_blend_n_scale_ab_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_n_scale_ab_8_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm2, %ymm0
+	vaddpd	%ymm1, %ymm3, %ymm1
+	vaddpd	%ymm4, %ymm6, %ymm4
+	vaddpd	%ymm5, %ymm7, %ymm5
+	vaddpd	%ymm0, %ymm4, %ymm0
+	vaddpd	%ymm1, %ymm5, %ymm1
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm1, %ymm15, %ymm1
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd		0(%r12), %ymm14
+	vfmadd231pd	%ymm15, %ymm14, %ymm0
+	vmovupd		32(%r12), %ymm14
+	vfmadd231pd	%ymm15, %ymm14, %ymm1
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_n_scale_ab_8_lib4, .-inner_blend_n_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_AB_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_ab_8_lib4, @function
+inner_blend_t_scale_ab_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_ab_8_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm5, %ymm4, %ymm4
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vhaddpd	%ymm7, %ymm6, %ymm6
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm3
+	vperm2f128	$0x2, %ymm4, %ymm6, %ymm5
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vperm2f128	$0x13, %ymm4, %ymm6, %ymm4
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm4, %ymm5, %ymm1
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+	vmulpd	%ymm1, %ymm15, %ymm1
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd		0(%r12), %ymm14
+	vfmadd231pd	%ymm15, %ymm14, %ymm0
+	vmovupd		32(%r12), %ymm14
+	vfmadd231pd	%ymm15, %ymm14, %ymm1
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_ab_8_lib4, .-inner_blend_t_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==n
+//
+// input arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm4 <- [z0 z1 z2 z3]_c
+// ymm5 <- [z4 z5 z6 z7]_c
+// ymm6 <- [z0 z1 z2 z3]_d
+// ymm7 <- [z4 z5 z6 z7]_d
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLENDER_N_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blender_n_8_lib4, @function
+inner_blender_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_blender_n_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blender_n_8_lib4; .scl 2; .type 32; .endef
+inner_blender_n_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vaddpd	%ymm0, %ymm2, %ymm0
+	vaddpd	%ymm1, %ymm3, %ymm1
+	vaddpd	%ymm4, %ymm6, %ymm4
+	vaddpd	%ymm5, %ymm7, %ymm5
+	vaddpd	%ymm0, %ymm4, %ymm0
+	vaddpd	%ymm1, %ymm5, %ymm1
+
+	cmpl	$0, %r10d // alg
+	je		0f // return
+
+	cmpl	$1, %r10d // alg
+	jne		1f // alg==-1
+
+	// alg==1
+	vmovupd		0(%r11), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+
+	jmp		0f // return
+
+1:
+
+	// alg==-1
+	vmovupd		0(%r11), %ymm15
+	vsubpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vsubpd		%ymm1, %ymm15, %ymm1
+
+0: // return
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blender_n_8_lib4, .-inner_blender_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==t
+//
+// input arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11   <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLENDER_T_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blender_t_8_lib4, @function
+inner_blender_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_blender_t_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blender_t_8_lib4; .scl 2; .type 32; .endef
+inner_blender_t_8_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm5, %ymm4, %ymm4
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vhaddpd	%ymm7, %ymm6, %ymm6
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm3
+	vperm2f128	$0x2, %ymm4, %ymm6, %ymm5
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vperm2f128	$0x13, %ymm4, %ymm6, %ymm4
+	vaddpd	%ymm0, %ymm3, %ymm0
+	vaddpd	%ymm4, %ymm5, %ymm1
+
+	cmpl	$0, %r10d // alg
+	je		0f // return
+
+	cmpl	$1, %r10d // alg
+	jne		1f // alg==-1
+
+	// alg==1
+	vmovupd		0(%r11), %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+
+	jmp		0f // return
+
+1:
+
+	// alg==-1
+	vmovupd		0(%r11), %ymm15
+	vsubpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r11), %ymm15
+	vsubpd		%ymm1, %ymm15, %ymm1
+
+0: // return
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blender_t_8_lib4, .-inner_blender_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store 
+//
+// input arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+//
+// output arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8_lib4, @function
+inner_store_8_lib4:
+#elif defined(OS_MAC)
+_inner_store_8_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8_lib4; .scl 2; .type 32; .endef
+inner_store_8_lib4:
+#endif
+#endif
+	
+	vmovupd %ymm0, 0(%r10)
+	vmovupd %ymm1, 32(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8_lib4, .-inner_store_8_lib4
+#endif
+#endif
+
+
+
+
+
+//                            rdi    rsi            rdx        rcx      r8         r9            rsp+8      rsp+16
+// void kernel_dgemv_n_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
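+//
+// usage sketch (a non-normative example inferred from the inner-routine
+// comments in this file): with A stored panel-major in 4-row panels and the
+// second panel starting at A + 4*sda doubles, a call such as
+//     kernel_dgemv_n_8_lib4(k, &alpha, A, sda, x, &beta, y, z);
+// computes z[0:8] = alpha * A[0:8,0:k] * x[0:k] + beta * y[0:8].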
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_n_8_lib4
+	.type kernel_dgemv_n_8_lib4, @function
+kernel_dgemv_n_8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_n_8_lib4
+_kernel_dgemv_n_8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_n_8_lib4
+	.def kernel_dgemv_n_8_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_SCALE_AB_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_scale_ab_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+	// store
+
+	movq	ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_n_8_lib4, .-kernel_dgemv_n_8_lib4
+#endif
+
+
+
+
+
+//                            rdi    rsi           rdx         rcx      r8         r9            rsp+8      rsp+16
+// void kernel_dgemv_t_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
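+//
+// usage sketch (non-normative, inferred from the inner-routine comments):
+//     kernel_dgemv_t_8_lib4(k, &alpha, A, sda, x, &beta, y, z);
+// computes z[0:8] = alpha * A[0:k,0:8]^T * x[0:k] + beta * y[0:8], where A is
+// panel-major with a stride of 4*sda doubles between consecutive 4-row panels.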
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_t_8_lib4
+	.type kernel_dgemv_t_8_lib4, @function
+kernel_dgemv_t_8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_t_8_lib4
+_kernel_dgemv_t_8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_t_8_lib4
+	.def kernel_dgemv_t_8_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemv kernel t
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG5, %r13  // x
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_T_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_t_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_t_8_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11   // beta
+	movq	ARG7, %r12 // y 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_8_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_t_8_lib4, .-kernel_dgemv_t_8_lib4
+#endif
+
+
+
+
+
+//                             rdi    rsi        rdx      rcx        r8
+// void kernel_dtrmv_un_8_lib4(int k, double *A, int sda, double *x, double *z);
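+//
+// (sketch, inferred from inner_edge_dtrmv_un_8_lib4 above): computes
+// z[0:8] = A * x for the top 8 rows of an upper-triangular, non-transposed,
+// panel-major matrix; the first 8 columns are handled by the edge routine with
+// below-diagonal entries masked to zero, the remaining k-8 columns by the
+// plain dgemv_add_n kernel.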
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmv_un_8_lib4
+	.type kernel_dtrmv_un_8_lib4, @function
+kernel_dtrmv_un_8_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmv_un_8_lib4
+_kernel_dtrmv_un_8_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmv_un_8_lib4
+	.def kernel_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_un_8_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dtrmv edge & dgemv kernel n
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11  // A
+	movq	ARG3, %r12
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG4, %r13  // x
+
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMV_UN_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmv_un_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmv_un_8_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_n_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+	// call inner blend n
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_n_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_n_8_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // z
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_8_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmv_un_8_lib4, .-kernel_dtrmv_un_8_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
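+
+// (note on the encoding above) each double is emitted as two .long words, low
+// 32 bits first: e.g. 0.5 = 0x3FE0000000000000 gives ".long 0" followed by
+// ".long 1071644672" (0x3FE00000). Reading from the lowest address the lanes
+// are { 0.5 1.5 2.5 3.5 }; the label comment lists lane 3 first, matching the
+// _mm256_set_pd order.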
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgetrf_pivot_4_lib4.c b/kernel/avx2/kernel_dgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..b1329fe
--- /dev/null
+++ b/kernel/avx2/kernel_dgetrf_pivot_4_lib4.c
@@ -0,0 +1,1435 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
+#include <pmmintrin.h>  // SSE3
+#include <smmintrin.h>  // SSE4
+#include <immintrin.h>  // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
+// C numbering (starting from zero) in the ipiv
+void kernel_dgetrf_pivot_4_lib4(int m, double *pA, int sda, double *inv_diag_A, int* ipiv)
+	{
+
+	const int bs = 4;
+
+	// assume m>=4
+	int ma = m-4;
+
+	__m128d
+		max0, max1, msk0, imx0, imx1,
+		inv;
+	
+		
+	__m256d
+		lft, msk,
+		sgn, vna, max, imx, idx,
+		ones,
+		tmp,
+		a_0,
+		b_0, b_1, b_2,
+		scl,
+		c_0,
+		d_0;
+	
+	double
+		dlft;
+
+	sgn = _mm256_set_pd( -0.0, -0.0, -0.0, -0.0 );
+	vna = _mm256_set_pd( 4.0, 4.0, 4.0, 4.0 );
+	lft  = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+	ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+
+	double
+		tmp0;
+	
+	double
+		*pB;
+	
+	int 
+		k, idamax;
+	
+	int B_pref = bs*sda;
+	
+
+	// first column
+
+	// find pivot
+	pB = &pA[0+bs*0];
+	idx = lft; // _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	k = 0;
+	for( ; k<m-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0] );
+//		__builtin_prefetch( pB+2*B_pref );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0] );
+//		__builtin_prefetch( pB+2*B_pref );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for( ; k<m-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0] );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<m)
+		{
+		dlft = m-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		a_0 = _mm256_load_pd( &pB[0] );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_blendv_pd( a_0, sgn, msk );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	ipiv[0] = idamax;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[0]!=0)
+			drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+		inv = _mm_loaddup_pd( &pA[0+bs*0] );
+		inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+		scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+		_mm_store_sd( &inv_diag_A[0], inv );
+		}
+	else
+		{
+		scl = ones;
+		inv_diag_A[0] = 0.0;
+		}
+
+
+	// second column
+
+	// scale & correct & find pivot
+	idx = _mm256_set_pd( 2.2, 1.2, 0.2, -0.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	c_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	a_0 = _mm256_blend_pd( tmp, a_0, 0x1 );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	d_0 = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( d_0, c_0, 0x1 );
+	_mm256_store_pd( &pA[0+bs*0], a_0 );
+	_mm256_store_pd( &pA[0+bs*1], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x1 );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	ipiv[1] = idamax+1;
+	if(tmp0!=0)
+		{
+		if(ipiv[1]!=1)
+			drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+		inv = _mm_loaddup_pd( &pA[1+bs*1] );
+		inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+		scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+		_mm_store_sd( &inv_diag_A[1], inv );
+		}
+	else
+		{
+		scl = ones;
+		inv_diag_A[1] = 0.0;
+		}
+
+
+	// third column
+
+	// scale & correct & find pivot
+	idx = _mm256_set_pd( 1.2, 0.2, -0.8, -1.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	c_0 = _mm256_load_pd( &pA[0+bs*2] );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( tmp, c_0, 0x1 );
+	a_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	a_0 = _mm256_blend_pd( tmp, a_0, 0x3 );
+	b_1 = _mm256_permute_pd( b_1, 0xf );
+	tmp = _mm256_mul_pd( a_0, b_1 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( tmp, c_0, 0x3 );
+	_mm256_store_pd( &pA[0+bs*1], a_0 );
+	_mm256_store_pd( &pA[0+bs*2], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x3 );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	ipiv[2] = idamax+2;
+	if(tmp0!=0)
+		{
+		if(ipiv[2]!=2)
+			drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+		inv = _mm_loaddup_pd( &pA[2+bs*2] );
+		inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+		scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+		_mm_store_sd( &inv_diag_A[2], inv );
+		}
+	else
+		{
+		scl = ones;
+		inv_diag_A[2] = 0.0;
+		}
+
+
+	// fourth column
+
+	// scale & correct & find pivot
+	idx = _mm256_set_pd( 0.2, -0.8, -1.8, -2.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	c_0 = _mm256_load_pd( &pA[0+bs*3] );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( tmp, c_0, 0x1 );
+	b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_1 = _mm256_permute_pd( b_1, 0xf );
+	a_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, b_1 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( tmp, c_0, 0x3 );
+	a_0 = _mm256_load_pd( &pA[0+bs*2] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_2 = _mm256_permute2f128_pd( c_0, c_0, 0x11 );
+	a_0 = _mm256_blend_pd( tmp, a_0, 0x7 );
+	b_2 = _mm256_permute_pd( b_2, 0x0 );
+	tmp = _mm256_mul_pd( a_0, b_2 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	c_0 = _mm256_blend_pd( tmp, c_0, 0x7 );
+	_mm256_store_pd( &pA[0+bs*2], a_0 );
+	_mm256_store_pd( &pA[0+bs*3], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x7 );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	ipiv[3] = idamax+3;
+	if(tmp0!=0)
+		{
+		if(ipiv[3]!=3)
+			drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+		inv = _mm_loaddup_pd( &pA[3+bs*3] );
+		inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+		scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+		_mm_store_sd( &inv_diag_A[3], inv );
+		}
+	else
+		{
+		scl = ones;
+		inv_diag_A[3] = 0.0;
+		}
+
+	// scale
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		tmp = _mm256_mul_pd( c_0, scl );
+		c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+//		pB += B_pref;
+		}
+
+	return;
+
+	}
+
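The column sweeps above are a vectorized abs-max search with index tracking: `idx` carries candidate row indexes as doubles (offset by 0.2, which still rounds back to the integer row under `_mm_cvtsd_si32`'s round-to-nearest), the strict `>` comparison keeps the lowest row on ties (as the "lower indexes" comments note), and the winning row is swapped in through the panel-major address `pA + p/bs*bs*sda + p%bs` (e.g. with sda=8, row 6 sits at offset 6/4*4*8 + 6%4 = 34 doubles). A scalar reference of the same search, as a reading aid for the intrinsics (hypothetical helper, not a BLASFEO routine):

	// Index of the entry with the largest absolute value in rows j..m-1 of column j,
	// for a panel-major matrix with 4-row panels and panel leading dimension sda.
	static int ref_idamax_panel(int m, int j, const double *pA, int sda)
		{
		const int bs = 4;
		int p, ip = j;
		double a, amax = 0.0;
		for(p=j; p<m; p++)
			{
			// element (p, j) lives at pA[p/bs*bs*sda + p%bs + bs*j]
			a = pA[p/bs*bs*sda + p%bs + bs*j];
			if(a<0.0)
				a = -a;
			if(a>amax) // strict '>': earlier rows win ties, as in the vector code
				{
				amax = a;
				ip = p;
				}
			}
		return ip;
		}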
+	
+
+void kernel_dgetrf_pivot_4_vs_lib4(int m, int n, double *pA, int sda, double *inv_diag_A, int* ipiv)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	// m and n may be smaller than 4 here (unlike the fixed-size kernel); ma can then be negative and the panel loops below are simply skipped
+	int ma = m-4;
+
+	__m128d
+		max0, max1, msk0, imx0, imx1,
+		inv;
+	
+		
+	__m256d
+		lft, msk,
+		sgn, vna, max, imx, idx,
+		ones,
+		tmp,
+		a_0,
+		b_0, b_1, b_2,
+		scl,
+		c_0,
+		d_0;
+	
+	double
+		dlft;
+
+	sgn = _mm256_set_pd( -0.0, -0.0, -0.0, -0.0 );
+	vna = _mm256_set_pd( 4.0, 4.0, 4.0, 4.0 );
+	lft  = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+	ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+
+	double
+		tmp0;
+	
+	double
+		*pB;
+	
+	int 
+		k, idamax;
+	
+	int B_pref = bs*sda;
+	
+
+	// first column
+
+	// find pivot
+	pB = &pA[0+bs*0];
+	idx = lft; // _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	k = 0;
+	for( ; k<m-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0] );
+//		__builtin_prefetch( pB+2*B_pref );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0] );
+//		__builtin_prefetch( pB+2*B_pref );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for( ; k<m-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0] );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<m)
+		{
+		dlft = m-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		a_0 = _mm256_load_pd( &pB[0] );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_blendv_pd( a_0, sgn, msk );
+		a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+		msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, a_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	ipiv[0] = idamax;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[0]!=0)
+			drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+		inv = _mm_loaddup_pd( &pA[0+bs*0] );
+		inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+		scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+		_mm_store_sd( &inv_diag_A[0], inv );
+		}
+	else
+		{
+		scl = ones;
+		inv_diag_A[0] = 0.0;
+		}
+	
+	if(n==1)
+		{
+		// scale & return
+		dlft = m;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_load_pd( &pA[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_blend_pd( tmp, a_0, 0x1 );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		_mm256_store_pd( &pA[0+bs*0], a_0 );
+		pB = pA + B_pref;
+		k = 0;
+		for(; k<ma-7; k+=8)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*0] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*0], a_0 );
+			pB += B_pref;
+			a_0 = _mm256_load_pd( &pB[0+bs*0] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*0], a_0 );
+			pB += B_pref;
+			}
+		for(; k<ma-3; k+=4)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*0] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*0], a_0 );
+			pB += B_pref;
+			}
+		if(k<ma)
+			{
+			dlft = ma-k;
+			msk = _mm256_broadcast_sd( &dlft );
+			msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+			a_0 = _mm256_load_pd( &pB[0+bs*0] );
+			tmp = _mm256_mul_pd( a_0, scl );
+			a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+			_mm256_store_pd( &pB[0+bs*0], a_0 );
+	//		pB += B_pref;
+			}
+
+		return;
+		}
+
+
+	// second column
+
+	// scale & correct & find pivot
+	dlft = m;
+	msk = _mm256_broadcast_sd( &dlft );
+	msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+	idx = _mm256_set_pd( 2.2, 1.2, 0.2, -0.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	c_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	tmp = _mm256_blend_pd( tmp, a_0, 0x1 );
+	a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	d_0 = _mm256_sub_pd( c_0, tmp );
+	d_0 = _mm256_blend_pd( d_0, c_0, 0x1 );
+	c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+	_mm256_store_pd( &pA[0+bs*0], a_0 );
+	_mm256_store_pd( &pA[0+bs*1], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x1 );
+	c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+		_mm256_store_pd( &pB[0+bs*0], a_0 );
+		_mm256_store_pd( &pB[0+bs*1], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	if(m>1)
+		{
+		ipiv[1] = idamax+1;
+		if(tmp0!=0)
+			{
+			if(ipiv[1]!=1)
+				drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+			inv = _mm_loaddup_pd( &pA[1+bs*1] );
+			inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+			scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+			_mm_store_sd( &inv_diag_A[1], inv );
+			}
+		else
+			{
+			scl = ones;
+			inv_diag_A[1] = 0.0;
+			}
+		}
+
+	if(n==2)
+		{
+		// scale & return
+		dlft = m;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_load_pd( &pA[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_blend_pd( tmp, a_0, 0x3 );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		_mm256_store_pd( &pA[0+bs*1], a_0 );
+		pB = pA + B_pref;
+		k = 0;
+		for(; k<ma-7; k+=8)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*1] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*1], a_0 );
+			pB += B_pref;
+			a_0 = _mm256_load_pd( &pB[0+bs*1] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*1], a_0 );
+			pB += B_pref;
+			}
+		for(; k<ma-3; k+=4)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*1] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*1], a_0 );
+			pB += B_pref;
+			}
+		if(k<ma)
+			{
+			dlft = ma-k;
+			msk = _mm256_broadcast_sd( &dlft );
+			msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+			a_0 = _mm256_load_pd( &pB[0+bs*1] );
+			tmp = _mm256_mul_pd( a_0, scl );
+			a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+			_mm256_store_pd( &pB[0+bs*1], a_0 );
+	//		pB += B_pref;
+			}
+
+		return;
+		}
+
+	// third column
+
+	// scale & correct & find pivot
+	dlft = m;
+	msk = _mm256_broadcast_sd( &dlft );
+	msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+	idx = _mm256_set_pd( 1.2, 0.2, -0.8, -1.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	c_0 = _mm256_load_pd( &pA[0+bs*2] );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	tmp = _mm256_blend_pd( tmp, c_0, 0x1 );
+	c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+	a_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	tmp = _mm256_blend_pd( tmp, a_0, 0x3 );
+	a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+	b_1 = _mm256_permute_pd( b_1, 0xf );
+	tmp = _mm256_mul_pd( a_0, b_1 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	tmp = _mm256_blend_pd( tmp, c_0, 0x3 );
+	c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+	_mm256_store_pd( &pA[0+bs*1], a_0 );
+	_mm256_store_pd( &pA[0+bs*2], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x3 );
+	c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		_mm256_store_pd( &pB[0+bs*1], a_0 );
+		_mm256_store_pd( &pB[0+bs*2], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	if(m>2)
+		{
+		ipiv[2] = idamax+2;
+		if(tmp0!=0)
+			{
+			if(ipiv[2]!=2)
+				drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+			inv = _mm_loaddup_pd( &pA[2+bs*2] );
+			inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+			scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+			_mm_store_sd( &inv_diag_A[2], inv );
+			}
+		else
+			{
+			scl = ones;
+			inv_diag_A[2] = 0.0;
+			}
+		}
+
+	if(n==3)
+		{
+		// scale & return
+		dlft = m;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		a_0 = _mm256_load_pd( &pA[0+bs*2] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_blend_pd( tmp, a_0, 0x7 );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		_mm256_store_pd( &pA[0+bs*2], a_0 );
+		pB = pA + B_pref;
+		k = 0;
+		for(; k<ma-7; k+=8)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*2] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*2], a_0 );
+			pB += B_pref;
+			a_0 = _mm256_load_pd( &pB[0+bs*2] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*2], a_0 );
+			pB += B_pref;
+			}
+		for(; k<ma-3; k+=4)
+			{
+			a_0 = _mm256_load_pd( &pB[0+bs*2] );
+			a_0 = _mm256_mul_pd( a_0, scl );
+			_mm256_store_pd( &pB[0+bs*2], a_0 );
+			pB += B_pref;
+			}
+		if(k<ma)
+			{
+			dlft = ma-k;
+			msk = _mm256_broadcast_sd( &dlft );
+			msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+			a_0 = _mm256_load_pd( &pB[0+bs*2] );
+			tmp = _mm256_mul_pd( a_0, scl );
+			a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+			_mm256_store_pd( &pB[0+bs*2], a_0 );
+	//		pB += B_pref;
+			}
+
+		return;
+		}
+
+	// fourth column
+
+	// scale & correct & find pivot
+	dlft = m;
+	msk = _mm256_broadcast_sd( &dlft );
+	msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+	idx = _mm256_set_pd( 0.2, -0.8, -1.8, -2.8 );
+	max = _mm256_setzero_pd();
+	imx = _mm256_setzero_pd();
+	c_0 = _mm256_load_pd( &pA[0+bs*3] );
+	b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_0 = _mm256_permute_pd( b_0, 0x0 );
+	a_0 = _mm256_load_pd( &pA[0+bs*0] );
+	tmp = _mm256_mul_pd( a_0, b_0 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	tmp = _mm256_blend_pd( tmp, c_0, 0x1 );
+	c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+	b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+	b_1 = _mm256_permute_pd( b_1, 0xf );
+	a_0 = _mm256_load_pd( &pA[0+bs*1] );
+	tmp = _mm256_mul_pd( a_0, b_1 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	tmp = _mm256_blend_pd( tmp, c_0, 0x3 );
+	c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+	a_0 = _mm256_load_pd( &pA[0+bs*2] );
+	tmp = _mm256_mul_pd( a_0, scl );
+	b_2 = _mm256_permute2f128_pd( c_0, c_0, 0x11 );
+	tmp = _mm256_blend_pd( tmp, a_0, 0x7 );
+	a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+	b_2 = _mm256_permute_pd( b_2, 0x0 );
+	tmp = _mm256_mul_pd( a_0, b_2 );
+	tmp = _mm256_sub_pd( c_0, tmp );
+	tmp = _mm256_blend_pd( tmp, c_0, 0x7 );
+	c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+	_mm256_store_pd( &pA[0+bs*2], a_0 );
+	_mm256_store_pd( &pA[0+bs*3], c_0 );
+	c_0 = _mm256_blend_pd( c_0, sgn, 0x7 );
+	c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+	c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+	msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+	max = _mm256_blendv_pd( max, c_0, msk );
+	imx = _mm256_blendv_pd( imx, idx, msk );
+	idx = _mm256_add_pd( idx, vna );
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+//		__builtin_prefetch( pB+2*B_pref );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		a_0 = _mm256_mul_pd( a_0, scl );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		c_0 = _mm256_sub_pd( c_0, tmp );
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+		idx = _mm256_add_pd( idx, vna );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		a_0 = _mm256_load_pd( &pB[0+bs*0] );
+		tmp = _mm256_mul_pd( a_0, b_0 );
+		d_0 = _mm256_sub_pd( c_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		a_0 = _mm256_load_pd( &pB[0+bs*1] );
+		tmp = _mm256_mul_pd( a_0, b_1 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		a_0 = _mm256_load_pd( &pB[0+bs*2] );
+		tmp = _mm256_mul_pd( a_0, scl );
+		a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+		tmp = _mm256_mul_pd( a_0, b_2 );
+		d_0 = _mm256_sub_pd( d_0, tmp );
+		c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+		_mm256_store_pd( &pB[0+bs*2], a_0 );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+		c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+		msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+		max = _mm256_blendv_pd( max, c_0, msk );
+		imx = _mm256_blendv_pd( imx, idx, msk );
+//		idx = _mm256_add_pd( idx, vna );
+//		pB += B_pref;
+		}
+	max0 = _mm256_extractf128_pd( max, 0x0 );
+	max1 = _mm256_extractf128_pd( max, 0x1 );
+	imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+	imx1 = _mm256_extractf128_pd( imx, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	max1 = _mm_permute_pd( max0, 0x1 );
+	imx1 = _mm_permute_pd( imx0, 0x1 );
+	msk0 = _mm_cmp_pd( max1, max0, 14 );
+	max0 = _mm_blendv_pd( max0, max1, msk0 );
+	imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+	_mm_store_sd( &tmp0, max0 );
+	idamax = _mm_cvtsd_si32( imx0 );
+
+	// compute scaling
+	if(m>3)
+		{
+		ipiv[3] = idamax+3;
+		if(tmp0!=0)
+			{
+			if(ipiv[3]!=3)
+				drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+			inv = _mm_loaddup_pd( &pA[3+bs*3] );
+			inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+			scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+			_mm_store_sd( &inv_diag_A[3], inv );
+			}
+		else
+			{
+			scl = ones;
+			inv_diag_A[3] = 0.0;
+			}
+		}
+
+	// scale
+	pB = pA + B_pref;
+	k = 0;
+	for(; k<ma-7; k+=8)
+		{
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+//		__builtin_prefetch( pB+2*B_pref+8 );
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+		}
+	for(; k<ma-3; k+=4)
+		{
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		c_0 = _mm256_mul_pd( c_0, scl );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+		pB += B_pref;
+		}
+	if(k<ma)
+		{
+		dlft = ma-k;
+		msk = _mm256_broadcast_sd( &dlft );
+		msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+		c_0 = _mm256_load_pd( &pB[0+bs*3] );
+		tmp = _mm256_mul_pd( c_0, scl );
+		c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+		_mm256_store_pd( &pB[0+bs*3], c_0 );
+//		pB += B_pref;
+		}
+
+	return;
+
+	}
+
+
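Both kernels share the same interface: `pA` points at the top-left of a 4-column panel-major block, `sda` is the panel leading dimension, `inv_diag_A` receives the reciprocal of each pivot (or 0.0 when the pivot is exactly zero), and `ipiv` gets zero-based pivot row indexes relative to the block. A hypothetical call-site sketch; the wrapper name and dispatch rule are assumptions, only the two kernel signatures come from this file:

	// Dispatch sketch: use the fixed-size kernel for a block with at least
	// 4 rows and 4 columns, and the variable-size kernel otherwise.
	void dgetrf_pivot_panel_sketch(int m, int n, double *pA, int sda,
		double *inv_diag_A, int *ipiv)
		{
		if(m>=4 && n>=4)
			kernel_dgetrf_pivot_4_lib4(m, pA, sda, inv_diag_A, ipiv);
		else
			kernel_dgetrf_pivot_4_vs_lib4(m, n, pA, sda, inv_diag_A, ipiv);
		}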
diff --git a/kernel/avx2/kernel_dsymv_6_lib4.S b/kernel/avx2/kernel_dsymv_6_lib4.S
new file mode 100644
index 0000000..7a4411c
--- /dev/null
+++ b/kernel/avx2/kernel_dsymv_6_lib4.S
@@ -0,0 +1,996 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
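The ARG1..ARG18 macros abstract the two x86-64 calling conventions: under the SysV ABI (Linux/Mac) the first six integer or pointer arguments arrive in rdi, rsi, rdx, rcx, r8, r9, and the seventh onwards sit on the caller's stack, reached at STACKSIZE+8(%rsp) once the PROLOGUE has done `subq $STACKSIZE, %rsp`; under Win64 only four arguments are in registers (rcx, rdx, r8, r9) and the fifth starts at STACKSIZE+40(%rsp) because of the 32-byte shadow space plus the return address. A hypothetical C prototype, purely to illustrate the mapping (not a declaration from this library):

	/* On Linux/Mac, k..beta map to ARG1..ARG6 (rdi, rsi, rdx, rcx, r8, r9)
	   and z is read through ARG7 from the stack; on Windows, x already
	   spills to the stack as ARG5. */
	void example_kernel_7args(int k, double *alpha, double *A, int sda,
			double *x, double *beta, double *z);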
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t
+// r14   <- z_n
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm4  <- [z_t_4a z_t_4b z_t_4c z_t_4d]
+// ymm5  <- [z_t_5a z_t_5b z_t_5c z_t_5d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- x_n_4
+// ymm11 <- x_n_5
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t+k*sizeof(double)
+// r14   <- z_n+k*sizeof(double)
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm4  <- [z_t_4a z_t_4b z_t_4c z_t_4d]
+// ymm5  <- [z_t_5a z_t_5b z_t_5c z_t_5d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- x_n_4
+// ymm11 <- x_n_5
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMV_ADD_NT_6_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemv_add_nt_6_lib4, @function
+inner_kernel_dgemv_add_nt_6_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_nt_6_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemv_add_nt_6_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_nt_6_lib4:
+#endif
+#endif
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$4, %r10d
+	jl		0f // clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	vmovupd	0(%r13), %ymm12
+	vmovupd	0(%r14), %ymm13
+
+	vmovapd	0(%r11), %ymm14
+	vfmadd231pd	%ymm14, %ymm12, %ymm0
+	vfmadd231pd	%ymm14, %ymm6, %ymm13
+	
+	subl	$4, %r10d
+
+	vmovapd	32(%r11), %ymm14
+	vfmadd231pd	%ymm14, %ymm12, %ymm1
+	vfmadd231pd	%ymm14, %ymm7, %ymm13
+	
+	vmovapd	64(%r11), %ymm14
+	vfmadd231pd	%ymm14, %ymm12, %ymm2
+	vfmadd231pd	%ymm14, %ymm8, %ymm13
+
+	vmovapd	96(%r11), %ymm14
+	vfmadd231pd	%ymm14, %ymm12, %ymm3
+	vfmadd231pd	%ymm14, %ymm9, %ymm13
+	
+	vmovapd	128(%r11), %ymm14
+	vfmadd231pd	%ymm14, %ymm12, %ymm4
+	vfmadd231pd	%ymm14, %ymm10, %ymm13
+	
+	vmovapd	160(%r11), %ymm14
+	vfmadd231pd	%ymm14, %ymm12, %ymm5
+	vfmadd231pd	%ymm14, %ymm11, %ymm13
+	
+	vmovupd	%ymm13, 0(%r14) 
+
+	addq	%r12, %r11
+	addq	$32, %r13
+	addq	$32, %r14
+	
+	cmpl	$3, %r10d
+
+	jg		1b // main loop 
+
+
+	// consider clean-up
+	cmpl	$0, %r10d
+	jle		2f // return
+
+0: // clean-up
+	
+	vcvtsi2sd	%r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovupd		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovupd		LC02(%rip), %ymm13
+#endif
+	vmovddup	%xmm14, %xmm14
+	vinsertf128	$1, %xmm14, %ymm14, %ymm14
+	vsubpd		%ymm14, %ymm13, %ymm15
+
+	vmaskmovpd	0(%r13), %ymm15, %ymm12
+	vmaskmovpd	0(%r14), %ymm15, %ymm13
+
+	vmaskmovpd	0(%r11), %ymm15, %ymm14
+	vfmadd231pd	%ymm14, %ymm12, %ymm0
+	vfmadd231pd	%ymm14, %ymm6, %ymm13
+	
+	vmaskmovpd	32(%r11), %ymm15, %ymm14
+	vfmadd231pd	%ymm14, %ymm12, %ymm1
+	vfmadd231pd	%ymm14, %ymm7, %ymm13
+	
+	vmaskmovpd	64(%r11), %ymm15, %ymm14
+	vfmadd231pd	%ymm14, %ymm12, %ymm2
+	vfmadd231pd	%ymm14, %ymm8, %ymm13
+
+	vmaskmovpd	96(%r11), %ymm15, %ymm14
+	vfmadd231pd	%ymm14, %ymm12, %ymm3
+	vfmadd231pd	%ymm14, %ymm9, %ymm13
+	
+	vmaskmovpd	128(%r11), %ymm15, %ymm14
+	vfmadd231pd	%ymm14, %ymm12, %ymm4
+	vfmadd231pd	%ymm14, %ymm10, %ymm13
+	
+	vmaskmovpd	160(%r11), %ymm15, %ymm14
+	vfmadd231pd	%ymm14, %ymm12, %ymm5
+	vfmadd231pd	%ymm14, %ymm11, %ymm13
+	
+	vmaskmovpd	%ymm13, %ymm15, 0(%r14)
+
+	sall	$3, %r10d
+	addq	%r10, %r11
+	addq	%r10, %r13
+	addq	%r10, %r14
+	xorl	%r10d, %r10d
+	
+	
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemv_add_nt_6_lib4, .-inner_kernel_dgemv_add_nt_6_lib4
+#endif
+#endif
+
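The routine above fuses the two matrix-vector products of a gemv_nt/symv step: for each 4-row slice it loads x_t and z_n once, then for each of the six columns accumulates the transposed product into the ymm0..ymm5 partial sums (reduced later by the blend routine) while updating z_n in place with the column times the broadcast x_n scalar. A plain-C reference of the same arithmetic, illustrative only and not the BLASFEO API:

	// Fused update on a k x 6 strip of a panel-major matrix A
	// (4-row panels, panel leading dimension sda):
	//   z_t[j] += sum_i A(i,j) * x_t[i]   (transposed product, j = 0..5)
	//   z_n[i] += sum_j A(i,j) * x_n[j]   (non-transposed product)
	static void ref_dgemv_add_nt_6(int k, const double *A, int sda,
		const double *x_t, const double *x_n, double *z_t, double *z_n)
		{
		const int bs = 4;
		int i, j;
		double a;
		for(i=0; i<k; i++)
			{
			for(j=0; j<6; j++)
				{
				a = A[i/bs*bs*sda + i%bs + bs*j];
				z_t[j] += a * x_t[i];
				z_n[i] += a * x_n[j];
				}
			}
		}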
+
+
+
+
+
+#if 0
+
+// TODO
+// common inner routine with file scope
+//
+// input arguments:
+// r10   <- kmax
+// r11   <- A
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t
+// r14   <- z_n
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- kmax-4
+// r11   <- A+4*k*sizeof(double)
+// r12   <- bs*sda*sizeof(double) = 32*sda
+// r13   <- x_t+k*sizeof(double)
+// r14   <- z_n+k*sizeof(double)
+// ymm0  <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1  <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2  <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3  <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6  <- x_n_0
+// ymm7  <- x_n_1
+// ymm8  <- x_n_2
+// ymm9  <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dsymv_add_nt_4_lib4, @function
+inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dsymv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_lib4:
+#endif
+#endif
+
+	vmovupd		0(%r13), %ymm12
+	vmovupd		0(%r14), %ymm13
+
+	vmovapd		0(%r11), %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm6, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmovapd		32(%r11), %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x1, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x3, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm7, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmovapd		64(%r11), %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x3, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x7, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm8, %ymm15
+	vaddpd		%ymm13, %ymm15, %ymm13
+
+	vmovapd		96(%r11), %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+	vblendpd	$0x7, %ymm15, %ymm14, %ymm14
+	vmulpd		%ymm14, %ymm12, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+//	vxorpd		%ymm15, %ymm15, %ymm15
+//	vblendpd	$0x0, %ymm14, %ymm15, %ymm14
+//	vmulpd		%ymm14, %ymm9, %ymm15
+//	vaddpd		%ymm13, %ymm15, %ymm13
+	
+	vmovupd		%ymm13, 0(%r14) 
+
+	addq	%r12, %r11
+	addq	$32, %r13
+	addq	$32, %r14
+	
+	subq	$4, %r10
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dsymv_add_nt_4_lib4, .-inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- beta
+// r12  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm4 <- dirty
+// ymm5 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_AB_6_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_ab_6_lib4, @function
+inner_blend_t_scale_ab_6_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_6_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_ab_6_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_6_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd			%ymm1, %ymm0, %ymm0
+	vhaddpd			%ymm3, %ymm2, %ymm2
+	vhaddpd			%ymm5, %ymm4, %ymm4
+//	vhaddpd			%ymm3, %ymm2, %ymm2
+	vperm2f128		$0x2, %ymm0, %ymm2, %ymm1
+	vperm2f128		$0x13, %ymm0, %ymm2, %ymm0
+	vextractf128	$0x1, %ymm4, %xmm5
+	vaddpd			%ymm0, %ymm1, %ymm0
+	vaddpd			%ymm4, %ymm5, %ymm4
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd			%ymm0, %ymm15, %ymm0
+	vmulpd			%ymm4, %ymm15, %ymm1
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm15
+	vmovupd			0(%r12), %ymm14
+	vmovupd			32(%r12), %ymm13
+	vfmadd231pd		%ymm15, %ymm14, %ymm0
+	vfmadd231pd		%ymm15, %ymm13, %ymm1
+	
+	vxorpd			%ymm15, %ymm15, %ymm15
+	vblendpd		$0x3, %ymm1, %ymm15, %ymm1
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_ab_6_lib4, .-inner_blend_t_scale_ab_6_lib4
+#endif
+#endif
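+
+// What the reduction above computes, as a hedged C sketch (ref_blend_t_scale_ab_6
+// is a hypothetical reference routine, not part of the library): each of
+// ymm0-ymm5 enters holding four partial sums for one entry of z_t; the
+// hadd/perm2f128 sequence collapses them so that ymm0 = {z0,z1,z2,z3} and the
+// low half of ymm1 = {z4,z5}, then z = alpha*z + beta*y is applied.
+//
+//   static void ref_blend_t_scale_ab_6(const double *alpha, const double *beta,
+//                                      const double *y, const double acc[6][4],
+//                                      double z[6])
+//       {
+//       for(int j=0; j<6; j++)
+//           {
+//           double s = acc[j][0] + acc[j][1] + acc[j][2] + acc[j][3];
+//           z[j] = alpha[0]*s + beta[0]*y[j];
+//           }
+//       }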
+
+
+
+
+
+#if 0
+
+//TODO
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- alpha
+// r11  <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_t_scale_a1_4_lib4, @function
+inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_t_scale_a1_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib4:
+#endif
+#endif
+
+	// reduction
+	vhaddpd	%ymm1, %ymm0, %ymm0
+	vhaddpd	%ymm3, %ymm2, %ymm2
+	vperm2f128	$0x2, %ymm0, %ymm2, %ymm1
+	vperm2f128	$0x13, %ymm0, %ymm2, %ymm0
+	vaddpd	%ymm0, %ymm1, %ymm0
+
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+	vmulpd	%ymm0, %ymm15, %ymm0
+
+	// beta
+	vmovupd		0(%r11), %ymm14
+	vaddpd		%ymm0, %ymm14, %ymm0
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+	
+#if defined(OS_LINUX)
+	.size	inner_blend_t_scale_a1_4_lib4, .-inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+#endif
+
+
+
+
+// common inner routine with file scope
+//
+// store 
+//
+// input arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+//
+// output arguments:
+// r10  <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_6_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_6_lib4, @function
+inner_store_6_lib4:
+#elif defined(OS_MAC)
+_inner_store_6_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_6_lib4; .scl 2; .type 32; .endef
+inner_store_6_lib4:
+#endif
+#endif
+	
+	vmovupd %ymm0, 0(%r10)
+	vmovupd %xmm1, 32(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_6_lib4, .-inner_store_6_lib4
+#endif
+#endif
+
+
+
+
+
+//                             rdi    rsi              rdx              rcx        r8       r9           rsp+8        rsp+16          rsp+24       rsp+32       rsp+40
+// void kernel_dgemv_nt_6_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemv_nt_6_lib4
+	.type kernel_dgemv_nt_6_lib4, @function
+kernel_dgemv_nt_6_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemv_nt_6_lib4
+_kernel_dgemv_nt_6_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemv_nt_6_lib4
+	.def kernel_dgemv_nt_6_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_6_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha_n
+	vbroadcastsd 0(%r10), %ymm15
+
+	movq	ARG6, %r10 // x_n
+
+	vbroadcastsd 0(%r10), %ymm6
+	vmulpd		%ymm15, %ymm6, %ymm6
+	vbroadcastsd 8(%r10), %ymm7
+	vmulpd		%ymm15, %ymm7, %ymm7
+	vbroadcastsd 16(%r10), %ymm8
+	vmulpd		%ymm15, %ymm8, %ymm8
+	vbroadcastsd 24(%r10), %ymm9
+	vmulpd		%ymm15, %ymm9, %ymm9
+	vbroadcastsd 32(%r10), %ymm10
+	vmulpd		%ymm15, %ymm10, %ymm10
+	vbroadcastsd 40(%r10), %ymm11
+	vmulpd		%ymm15, %ymm11, %ymm11
+
+
+	// inner kernel dgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // A
+	movq	ARG5, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG7, %r13  // x_t
+	movq	ARG10, %r14  // z_n
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_NT_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_nt_6_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_nt_6_lib4
+#endif
+#endif
+
+
+	// inner blend n scale ab
+
+	movq	ARG3, %r10 // alpha_t
+	movq	ARG8, %r11   // beta_t
+	movq	ARG9, %r12   // y_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_ab_6_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_ab_6_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG11, %r10 // z_t 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_6_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_6_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemv_nt_6_lib4, .-kernel_dgemv_nt_6_lib4
+#endif
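+
+// Reference semantics of kernel_dgemv_nt_6_lib4, as a hedged C sketch assuming
+// the lib4 panel-major layout (panels of bs=4 rows, panel stride sda);
+// ref_dgemv_nt_6 is a hypothetical model, not a library routine. For a k x 6
+// block of A the kernel updates z_n += alpha_n*A*x_n and computes
+// z_t = beta_t*y_t + alpha_t*A'*x_t.
+//
+//   static void ref_dgemv_nt_6(int k, double alpha_n, double alpha_t,
+//                              const double *A, int sda,
+//                              const double *x_n, const double *x_t,
+//                              double beta_t, const double *y_t,
+//                              double *z_n, double *z_t)
+//       {
+//       const int bs = 4;
+//       for(int j=0; j<6; j++)
+//           z_t[j] = beta_t*y_t[j];
+//       for(int i=0; i<k; i++)
+//           {
+//           for(int j=0; j<6; j++)
+//               {
+//               double a = A[(i/bs)*bs*sda + j*bs + i%bs]; // element (i,j)
+//               z_n[i] += alpha_n*a*x_n[j];
+//               z_t[j] += alpha_t*a*x_t[i];
+//               }
+//           }
+//       }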
+
+
+
+
+
+#if 0
+// TODO
+//                            rdi    rsi            rdx        rcx      r8           r9           rsp+8        rsp+16 
+// void kernel_dsymv_l_4_lib4(int k, double *alpha, double *A, int sda, double *x_n, double *x_t, double *z_n, double *z_t);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsymv_l_4_lib4
+	.type kernel_dsymv_l_4_lib4, @function
+kernel_dsymv_l_4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsymv_l_4_lib4
+_kernel_dsymv_l_4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsymv_l_4_lib4
+	.def kernel_dsymv_l_4_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers y_t
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+	// initialize x_n
+	movq	ARG2, %r10 // alpha
+	vbroadcastsd 0(%r10), %ymm15
+
+	movq	ARG5, %r10 // x_n
+
+	vbroadcastsd 0(%r10), %ymm6
+	vmulpd		%ymm15, %ymm6, %ymm6
+	vbroadcastsd 8(%r10), %ymm7
+	vmulpd		%ymm15, %ymm7, %ymm7
+	vbroadcastsd 16(%r10), %ymm8
+	vmulpd		%ymm15, %ymm8, %ymm8
+	vbroadcastsd 24(%r10), %ymm9
+	vmulpd		%ymm15, %ymm9, %ymm9
+
+
+	// inner edge dsyrk & kernel dgemv nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+//	movslq	%r12d, %r12
+	movq	ARG6, %r13  // x_t
+	movq	ARG7, %r14  // z_n
+
+#if MACRO_LEVEL>=2
+	INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dsymv_add_nt_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// call inner blend n scale ab
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11   // z_t
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // z_t 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsymv_l_4_lib4, .-kernel_dsymv_l_4_lib4
+#endif
+
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
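+
+// The .long pairs above are the low/high 32-bit words of little-endian IEEE-754
+// doubles, e.g. (0, 1071644672) = 0x3FE0000000000000 = 0.5 and (0, 1072693248)
+// = 1.0. Constants such as .LC02 drive the masked clean-up code, which subtracts
+// the remaining length from { 0.5 1.5 2.5 3.5 } so that the sign bits form a
+// vmaskmovpd lane mask. A minimal C check of the encoding (hypothetical helper,
+// illustration only):
+//
+//   #include <stdint.h>
+//   #include <stdio.h>
+//   #include <string.h>
+//
+//   int main(void)
+//       {
+//       uint64_t bits = (uint64_t)1071644672 << 32; // high word 0x3FE00000, low word 0
+//       double d;
+//       memcpy(&d, &bits, sizeof(d));
+//       printf("%f\n", d); // prints 0.500000
+//       return 0;
+//       }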
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
+
+
+
+
diff --git a/kernel/avx2/kernel_sgemm_16x4_lib8.S b/kernel/avx2/kernel_sgemm_16x4_lib8.S
new file mode 100644
index 0000000..857fb11
--- /dev/null
+++ b/kernel/avx2/kernel_sgemm_16x4_lib8.S
@@ -0,0 +1,6811 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+4*k*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nt_16x4_lib8, @function
+inner_kernel_gemm_add_nt_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nt_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_16x4_lib8:
+#endif
+#endif
+	
+// broadcast scheme
+#if 1
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovaps 		0(%r11), %ymm13 // A
+	vmovaps 		0(%r11, %r12, 1), %ymm14 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vbroadcastss	0(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vmovapd			32(%r11), %ymm10 // A
+	vbroadcastss	4(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	8(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vbroadcastss	12(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastss	32(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm0
+	vfmadd231ps		%ymm11, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastss	36(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm1
+	vfmadd231ps		%ymm11, %ymm12, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	40(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm2
+	vfmadd231ps		%ymm11, %ymm12, %ymm6
+	vbroadcastss	44(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm3
+	vfmadd231ps		%ymm11, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastss	64(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vmovapd			-32(%r11), %ymm10 // A
+	vbroadcastss	68(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vmovapd			-32(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	72(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vbroadcastss	76(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	addq	$128, %r13
+
+	// unroll 0
+	vbroadcastss	-32(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm0
+	vfmadd231ps		%ymm11, %ymm12, %ymm4
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastss	-28(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm1
+	vfmadd231ps		%ymm11, %ymm12, %ymm5
+	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	-24(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm2
+	vfmadd231ps		%ymm11, %ymm12, %ymm6
+	vbroadcastss	-20(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm3
+	vfmadd231ps		%ymm11, %ymm12, %ymm7
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vbroadcastss	0(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vmovapd			32(%r11), %ymm10 // A
+	vbroadcastss	4(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	8(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vbroadcastss	12(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastss	32(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm0
+	vfmadd231ps		%ymm11, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastss	36(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm1
+	vfmadd231ps		%ymm11, %ymm12, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	40(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm2
+	vfmadd231ps		%ymm11, %ymm12, %ymm6
+	vbroadcastss	44(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm3
+	vfmadd231ps		%ymm11, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastss	64(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vmovapd			-32(%r11), %ymm10 // A
+	vbroadcastss	68(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vmovapd			-32(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	72(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vbroadcastss	76(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	addq	$128, %r13
+
+	// unroll 0
+	vbroadcastss	-32(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm0
+	vfmadd231ps		%ymm11, %ymm12, %ymm4
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastss	-28(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm1
+	vfmadd231ps		%ymm11, %ymm12, %ymm5
+//	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	-24(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm2
+	vfmadd231ps		%ymm11, %ymm12, %ymm6
+	vbroadcastss	-20(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm3
+	vfmadd231ps		%ymm11, %ymm12, %ymm7
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm13 // a
+	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	0(%r13), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vbroadcastss	4(%r13), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	subl	$1, %r10d
+	vbroadcastss	8(%r13), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	addq	$32, %r11
+	vbroadcastss	12(%r13), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	addq	$32, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+// shuffle scheme
+#else
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 4*sda*sizeof(float)
+
+	// preload
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vmovaps			0(%r11), %ymm8 // A0
+	vmovaps			0(%r15), %ymm9 // A1
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	// unroll 0
+	vbroadcastf128	32(%r13), %ymm13 // B
+	vfmadd231ps		%ymm8, %ymm14, %ymm0
+	vfmadd231ps		%ymm9, %ymm14, %ymm4
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+
+	subl	$4, %r10d
+	vmovaps			32(%r11), %ymm10 // A0
+	vfmadd231ps		%ymm8, %ymm14, %ymm1
+	vfmadd231ps		%ymm9, %ymm14, %ymm5
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+
+	vmovaps			32(%r15), %ymm11 // A1
+	vfmadd231ps		%ymm8, %ymm14, %ymm2
+	vfmadd231ps		%ymm9, %ymm14, %ymm6
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+
+	vfmadd231ps		%ymm8, %ymm14, %ymm3
+	vfmadd231ps		%ymm9, %ymm14, %ymm7
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+
+
+	// unroll 1
+	vbroadcastf128	64(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm14, %ymm0
+	vfmadd231ps		%ymm11, %ymm14, %ymm4
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+
+	vmovaps			64(%r11), %ymm8 // A0
+	vfmadd231ps		%ymm10, %ymm14, %ymm1
+	vfmadd231ps		%ymm11, %ymm14, %ymm5
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+
+	vmovaps			64(%r15), %ymm9 // A1
+	vfmadd231ps		%ymm10, %ymm14, %ymm2
+	vfmadd231ps		%ymm11, %ymm14, %ymm6
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+
+	vfmadd231ps		%ymm10, %ymm14, %ymm3
+	vfmadd231ps		%ymm11, %ymm14, %ymm7
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+
+
+	// unroll 2
+	vbroadcastf128	96(%r13), %ymm13 // B
+	vfmadd231ps		%ymm8, %ymm14, %ymm0
+	vfmadd231ps		%ymm9, %ymm14, %ymm4
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+
+	addq	$128, %r13
+	vmovaps			96(%r11), %ymm10 // A0
+	vfmadd231ps		%ymm8, %ymm14, %ymm1
+	vfmadd231ps		%ymm9, %ymm14, %ymm5
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+
+	addq	$128, %r11
+	vmovaps			96(%r15), %ymm11 // A1
+	vfmadd231ps		%ymm8, %ymm14, %ymm2
+	vfmadd231ps		%ymm9, %ymm14, %ymm6
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+
+	addq	$128, %r15
+	vfmadd231ps		%ymm8, %ymm14, %ymm3
+	vfmadd231ps		%ymm9, %ymm14, %ymm7
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+
+
+	// unroll 3
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm14, %ymm0
+	vfmadd231ps		%ymm11, %ymm14, %ymm4
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+
+	vmovaps			0(%r11), %ymm8 // A0
+	vfmadd231ps		%ymm10, %ymm14, %ymm1
+	vfmadd231ps		%ymm11, %ymm14, %ymm5
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+
+	vmovaps			0(%r15), %ymm9 // A1
+	vfmadd231ps		%ymm10, %ymm14, %ymm2
+	vfmadd231ps		%ymm11, %ymm14, %ymm6
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+
+	vfmadd231ps		%ymm10, %ymm14, %ymm3
+	vfmadd231ps		%ymm11, %ymm14, %ymm7
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vbroadcastf128	32(%r13), %ymm13 // B
+	vfmadd231ps		%ymm8, %ymm14, %ymm0
+	vfmadd231ps		%ymm9, %ymm14, %ymm4
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+
+	subl	$4, %r10d
+	vmovaps			32(%r11), %ymm10 // A0
+	vfmadd231ps		%ymm8, %ymm14, %ymm1
+	vfmadd231ps		%ymm9, %ymm14, %ymm5
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+
+	vmovaps			32(%r15), %ymm11 // A1
+	vfmadd231ps		%ymm8, %ymm14, %ymm2
+	vfmadd231ps		%ymm9, %ymm14, %ymm6
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+
+	vfmadd231ps		%ymm8, %ymm14, %ymm3
+	vfmadd231ps		%ymm9, %ymm14, %ymm7
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+
+
+	// unroll 1
+	vbroadcastf128	64(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm14, %ymm0
+	vfmadd231ps		%ymm11, %ymm14, %ymm4
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+
+	vmovaps			64(%r11), %ymm8 // A0
+	vfmadd231ps		%ymm10, %ymm14, %ymm1
+	vfmadd231ps		%ymm11, %ymm14, %ymm5
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+
+	vmovaps			64(%r15), %ymm9 // A1
+	vfmadd231ps		%ymm10, %ymm14, %ymm2
+	vfmadd231ps		%ymm11, %ymm14, %ymm6
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+
+	vfmadd231ps		%ymm10, %ymm14, %ymm3
+	vfmadd231ps		%ymm11, %ymm14, %ymm7
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+
+
+	// unroll 2
+	vbroadcastf128	96(%r13), %ymm13 // B
+	vfmadd231ps		%ymm8, %ymm14, %ymm0
+	vfmadd231ps		%ymm9, %ymm14, %ymm4
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+
+	addq	$128, %r13
+	vmovaps			96(%r11), %ymm10 // A0
+	vfmadd231ps		%ymm8, %ymm14, %ymm1
+	vfmadd231ps		%ymm9, %ymm14, %ymm5
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+
+	addq	$128, %r11
+	vmovaps			96(%r15), %ymm11 // A1
+	vfmadd231ps		%ymm8, %ymm14, %ymm2
+	vfmadd231ps		%ymm9, %ymm14, %ymm6
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+
+	addq	$128, %r15
+	vfmadd231ps		%ymm8, %ymm14, %ymm3
+	vfmadd231ps		%ymm9, %ymm14, %ymm7
+	vshufps			$0x00, %ymm13, %ymm13, %ymm14
+
+
+	// unroll 3
+//	vbroadcastf128	0(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm14, %ymm0
+	vfmadd231ps		%ymm11, %ymm14, %ymm4
+	vshufps			$0x55, %ymm13, %ymm13, %ymm14
+
+//	vmovaps			0(%r11), %ymm8 // A0
+	vfmadd231ps		%ymm10, %ymm14, %ymm1
+	vfmadd231ps		%ymm11, %ymm14, %ymm5
+	vshufps			$0xaa, %ymm13, %ymm13, %ymm14
+
+//	vmovaps			0(%r15), %ymm9 // A1
+	vfmadd231ps		%ymm10, %ymm14, %ymm2
+	vfmadd231ps		%ymm11, %ymm14, %ymm6
+	vshufps			$0xff, %ymm13, %ymm13, %ymm14
+
+	vfmadd231ps		%ymm10, %ymm14, %ymm3
+	vfmadd231ps		%ymm11, %ymm14, %ymm7
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+
+
+//	cmpl	$4, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vmovaps			0(%r11), %ymm8 // A0
+	vmovaps			0(%r15), %ymm9 // A1
+	vshufps			$0x00, %ymm12, %ymm12, %ymm14
+	vfmadd231ps		%ymm8, %ymm14, %ymm0
+	vfmadd231ps		%ymm9, %ymm14, %ymm4
+
+	vshufps			$0x55, %ymm12, %ymm12, %ymm14
+	vfmadd231ps		%ymm8, %ymm14, %ymm1
+	vfmadd231ps		%ymm9, %ymm14, %ymm5
+
+	vshufps			$0xaa, %ymm12, %ymm12, %ymm14
+	vfmadd231ps		%ymm8, %ymm14, %ymm2
+	vfmadd231ps		%ymm9, %ymm14, %ymm6
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$32, %r13
+	addq	$32, %r15
+
+	vshufps			$0xff, %ymm12, %ymm12, %ymm14
+	vfmadd231ps		%ymm8, %ymm14, %ymm3
+	vfmadd231ps		%ymm9, %ymm14, %ymm7
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nt_16x4_lib8, .-inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
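+
+// Reference semantics of the NT inner kernel, as a hedged C sketch assuming the
+// lib8 panel-major layout (panels of bs=8 rows, with the lower 8-row panel of A
+// starting bs*sda floats after the upper one); ref_gemm_add_nt_16x4 is a
+// hypothetical model, not a library routine. ymm0-ymm3 accumulate columns 0-3
+// of the upper 8 rows, ymm4-ymm7 the lower 8 rows.
+//
+//   static void ref_gemm_add_nt_16x4(int k, const float *A, int sda,
+//                                    const float *B, float C[16][4])
+//       {
+//       const int bs = 8;
+//       for(int kk=0; kk<k; kk++)
+//           for(int j=0; j<4; j++)
+//               for(int i=0; i<16; i++)
+//                   C[i][j] += A[(i/bs)*bs*sda + kk*bs + i%bs] * B[kk*bs + j];
+//       }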
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+4*k*sizeof(double)
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_sub_nt_16x4_lib8, @function
+inner_kernel_gemm_sub_nt_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_sub_nt_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_16x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovaps 		0(%r11), %ymm13 // A
+	vmovaps 		0(%r11, %r12, 1), %ymm14 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vbroadcastss	0(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	vmovapd			32(%r11), %ymm10 // A
+	vbroadcastss	4(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	8(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vbroadcastss	12(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastss	32(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm10, %ymm12, %ymm0
+	vfnmadd231ps	%ymm11, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastss	36(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm10, %ymm12, %ymm1
+	vfnmadd231ps	%ymm11, %ymm12, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	40(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm10, %ymm12, %ymm2
+	vfnmadd231ps	%ymm11, %ymm12, %ymm6
+	vbroadcastss	44(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm10, %ymm12, %ymm3
+	vfnmadd231ps	%ymm11, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastss	64(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	vmovapd			-32(%r11), %ymm10 // A
+	vbroadcastss	68(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vmovapd			-32(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	72(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vbroadcastss	76(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+	addq	$128, %r13
+
+	// unroll 0
+	vbroadcastss	-32(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm10, %ymm12, %ymm0
+	vfnmadd231ps	%ymm11, %ymm12, %ymm4
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastss	-28(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm10, %ymm12, %ymm1
+	vfnmadd231ps	%ymm11, %ymm12, %ymm5
+	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	-24(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm10, %ymm12, %ymm2
+	vfnmadd231ps	%ymm11, %ymm12, %ymm6
+	vbroadcastss	-20(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm10, %ymm12, %ymm3
+	vfnmadd231ps	%ymm11, %ymm12, %ymm7
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vbroadcastss	0(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	vmovapd			32(%r11), %ymm10 // A
+	vbroadcastss	4(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	8(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vbroadcastss	12(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+	subl	$4, %r10d
+
+	// unroll 0
+	vbroadcastss	32(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm10, %ymm12, %ymm0
+	vfnmadd231ps	%ymm11, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastss	36(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm10, %ymm12, %ymm1
+	vfnmadd231ps	%ymm11, %ymm12, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	40(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm10, %ymm12, %ymm2
+	vfnmadd231ps	%ymm11, %ymm12, %ymm6
+	vbroadcastss	44(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm10, %ymm12, %ymm3
+	vfnmadd231ps	%ymm11, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 0
+	vbroadcastss	64(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	vmovapd			-32(%r11), %ymm10 // A
+	vbroadcastss	68(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vmovapd			-32(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	72(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vbroadcastss	76(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+	addq	$128, %r13
+
+	// unroll 0
+	vbroadcastss	-32(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm10, %ymm12, %ymm0
+	vfnmadd231ps	%ymm11, %ymm12, %ymm4
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastss	-28(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm10, %ymm12, %ymm1
+	vfnmadd231ps	%ymm11, %ymm12, %ymm5
+//	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	-24(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm10, %ymm12, %ymm2
+	vfnmadd231ps	%ymm11, %ymm12, %ymm6
+	vbroadcastss	-20(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm10, %ymm12, %ymm3
+	vfnmadd231ps	%ymm11, %ymm12, %ymm7
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm13 // a
+	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	0(%r13), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	vbroadcastss	4(%r13), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	subl	$1, %r10d
+	vbroadcastss	8(%r13), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	addq	$32, %r11
+	vbroadcastss	12(%r13), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+	addq	$32, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_sub_nt_16x4_lib8, .-inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
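+
+// Same structure as the ADD variant above, but built on vfnmadd231ps, i.e. in
+// the notation of the ref_gemm_add_nt_16x4 sketch the accumulation becomes
+// C[i][j] -= A[...] * B[...].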
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- 4*sda*sizeof(double)
+// r13   <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14   <- 4*sdb*sizeof(double)
+// r15   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nn_16x4_lib8, @function
+inner_kernel_gemm_add_nn_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nn_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_16x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovaps 		0(%r11), %ymm13 // A
+	vmovaps 		0(%r11, %r12, 1), %ymm14 // A
+
+	cmpl	$8, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	0(%r13, %r14, 1) // software prefetch
+	prefetcht0	64(%r13, %r14, 1) // software prefetch
+
+	// unroll 0
+	vbroadcastss	0(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vmovapd			32(%r11), %ymm10 // A
+	vbroadcastss	32(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	64(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vbroadcastss	96(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	subl	$8, %r10d
+
+	// unroll 1
+	vbroadcastss	4(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm0
+	vfmadd231ps		%ymm11, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastss	36(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm1
+	vfmadd231ps		%ymm11, %ymm12, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	68(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm2
+	vfmadd231ps		%ymm11, %ymm12, %ymm6
+	vbroadcastss	100(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm3
+	vfmadd231ps		%ymm11, %ymm12, %ymm7
+
+	// unroll 2
+	vbroadcastss	8(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vmovapd			96(%r11), %ymm10 // A
+	vbroadcastss	40(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	72(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vbroadcastss	104(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+
+	// unroll 3
+	vbroadcastss	12(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm0
+	vfmadd231ps		%ymm11, %ymm12, %ymm4
+	vmovapd			128(%r11), %ymm13 // A
+	vbroadcastss	44(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm1
+	vfmadd231ps		%ymm11, %ymm12, %ymm5
+	vmovapd			128(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	76(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm2
+	vfmadd231ps		%ymm11, %ymm12, %ymm6
+	vbroadcastss	108(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm3
+	vfmadd231ps		%ymm11, %ymm12, %ymm7
+
+	// unroll 4
+	vbroadcastss	16(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm0
+	vfmadd231ps		%ymm11, %ymm12, %ymm4
+	vmovapd			160(%r11), %ymm13 // A
+	vbroadcastss	48(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm1
+	vfmadd231ps		%ymm11, %ymm12, %ymm5
+	vmovapd			160(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	80(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm2
+	vfmadd231ps		%ymm11, %ymm12, %ymm6
+	vbroadcastss	112(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm3
+	vfmadd231ps		%ymm11, %ymm12, %ymm7
+
+	// unroll 5
+	vbroadcastss	20(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm0
+	vfmadd231ps		%ymm11, %ymm12, %ymm4
+	vmovapd			192(%r11), %ymm13 // A
+	vbroadcastss	52(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm1
+	vfmadd231ps		%ymm11, %ymm12, %ymm5
+	vmovapd			192(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	84(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm2
+	vfmadd231ps		%ymm11, %ymm12, %ymm6
+	vbroadcastss	116(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm3
+	vfmadd231ps		%ymm11, %ymm12, %ymm7
+
+	// unroll 6
+	vbroadcastss	24(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm0
+	vfmadd231ps		%ymm11, %ymm12, %ymm4
+	vmovapd			224(%r11), %ymm13 // A
+	vbroadcastss	56(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm1
+	vfmadd231ps		%ymm11, %ymm12, %ymm5
+	vmovapd			224(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	88(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm2
+	vfmadd231ps		%ymm11, %ymm12, %ymm6
+	vbroadcastss	120(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm3
+	vfmadd231ps		%ymm11, %ymm12, %ymm7
+	addq	$256, %r11
+
+	// unroll 7
+	vbroadcastss	28(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm0
+	vfmadd231ps		%ymm11, %ymm12, %ymm4
+	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastss	60(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm1
+	vfmadd231ps		%ymm11, %ymm12, %ymm5
+	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	92(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm2
+	vfmadd231ps		%ymm11, %ymm12, %ymm6
+	vbroadcastss	-4(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm3
+	vfmadd231ps		%ymm11, %ymm12, %ymm7
+	addq	%r14, %r13
+
+	cmpl	$8, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$7, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vbroadcastss	0(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vmovapd			32(%r11), %ymm10 // A
+	vbroadcastss	32(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vmovapd			32(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	64(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vbroadcastss	96(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	subl	$8, %r10d
+
+	// unroll 1
+	vbroadcastss	4(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm0
+	vfmadd231ps		%ymm11, %ymm12, %ymm4
+	vmovapd			64(%r11), %ymm13 // A
+	vbroadcastss	36(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm1
+	vfmadd231ps		%ymm11, %ymm12, %ymm5
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	68(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm2
+	vfmadd231ps		%ymm11, %ymm12, %ymm6
+	vbroadcastss	100(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm3
+	vfmadd231ps		%ymm11, %ymm12, %ymm7
+
+	// unroll 2
+	vbroadcastss	8(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vmovapd			96(%r11), %ymm10 // A
+	vbroadcastss	40(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vmovapd			96(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss	72(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vbroadcastss	104(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+
+	// unroll 3
+	vbroadcastss	12(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm0
+	vfmadd231ps		%ymm11, %ymm12, %ymm4
+	vmovapd			128(%r11), %ymm13 // A
+	vbroadcastss	44(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm1
+	vfmadd231ps		%ymm11, %ymm12, %ymm5
+	vmovapd			128(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	76(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm2
+	vfmadd231ps		%ymm11, %ymm12, %ymm6
+	vbroadcastss	108(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm3
+	vfmadd231ps		%ymm11, %ymm12, %ymm7
+
+	// unroll 4
+	vbroadcastss	16(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm0
+	vfmadd231ps		%ymm11, %ymm12, %ymm4
+	vmovapd			160(%r11), %ymm13 // A
+	vbroadcastss	48(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm1
+	vfmadd231ps		%ymm11, %ymm12, %ymm5
+	vmovapd			160(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	80(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm2
+	vfmadd231ps		%ymm11, %ymm12, %ymm6
+	vbroadcastss	112(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm3
+	vfmadd231ps		%ymm11, %ymm12, %ymm7
+
+	// unroll 5
+	vbroadcastss	20(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm0
+	vfmadd231ps		%ymm11, %ymm12, %ymm4
+	vmovapd			192(%r11), %ymm13 // A
+	vbroadcastss	52(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm1
+	vfmadd231ps		%ymm11, %ymm12, %ymm5
+	vmovapd			192(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	84(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm2
+	vfmadd231ps		%ymm11, %ymm12, %ymm6
+	vbroadcastss	116(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm3
+	vfmadd231ps		%ymm11, %ymm12, %ymm7
+
+	// unroll 6
+	vbroadcastss	24(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm0
+	vfmadd231ps		%ymm11, %ymm12, %ymm4
+	vmovapd			224(%r11), %ymm13 // A
+	vbroadcastss	56(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm1
+	vfmadd231ps		%ymm11, %ymm12, %ymm5
+	vmovapd			224(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	88(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm2
+	vfmadd231ps		%ymm11, %ymm12, %ymm6
+	vbroadcastss	120(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm3
+	vfmadd231ps		%ymm11, %ymm12, %ymm7
+	addq	$256, %r11
+
+	// unroll 7
+	vbroadcastss	28(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm0
+	vfmadd231ps		%ymm11, %ymm12, %ymm4
+//	vmovapd			0(%r11), %ymm13 // A
+	vbroadcastss	60(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm1
+	vfmadd231ps		%ymm11, %ymm12, %ymm5
+//	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vbroadcastss	92(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm2
+	vfmadd231ps		%ymm11, %ymm12, %ymm6
+	vbroadcastss	124(%r13), %ymm12 // B
+	vfmadd231ps		%ymm10, %ymm12, %ymm3
+	vfmadd231ps		%ymm11, %ymm12, %ymm7
+	addq	%r14, %r13
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A0
+	vmovaps			0(%r11, %r12, 1), %ymm13 // A1
+	vbroadcastss	0(%r13), %ymm15 // B[0]
+	vfmadd231ps		%ymm12, %ymm15, %ymm0
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vbroadcastss	32(%r13), %ymm15 // B[1]
+	vfmadd231ps		%ymm12, %ymm15, %ymm1
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vbroadcastss	64(%r13), %ymm15 // B[2]
+	vfmadd231ps		%ymm12, %ymm15, %ymm2
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vbroadcastss	96(%r13), %ymm15 // B[3]
+	vfmadd231ps		%ymm12, %ymm15, %ymm3
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$4, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nn_16x4_lib8, .-inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
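+
+// Reference semantics of the NN inner kernel, as a hedged C sketch under the
+// same lib8 panel-major assumptions as above (ref_gemm_add_nn_16x4 is a
+// hypothetical model): here B is also panel-major, with element (kk,j) of the
+// current panel at B[j*bs + kk%bs] and the next panel bs*sdb floats further on.
+// The separate edge routine below handles a B block that starts mid-panel.
+//
+//   static void ref_gemm_add_nn_16x4(int k, const float *A, int sda,
+//                                    const float *B, int sdb, float C[16][4])
+//       {
+//       const int bs = 8;
+//       for(int kk=0; kk<k; kk++)
+//           for(int j=0; j<4; j++)
+//               for(int i=0; i<16; i++)
+//                   C[i][j] += A[(i/bs)*bs*sda + kk*bs + i%bs]
+//                            * B[(kk/bs)*bs*sdb + j*bs + kk%bs];
+//       }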
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(float)
+// r15   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(8-offB)
+// r11   <- A+(8-offB)*bs*sizeof(float)
+// r12   <- bs*sda*sizeof(float)
+// r13   <- B-offB+bs*sdb*sizeof(float)
+// r14   <- bs*sdb*sizeof(float)
+// r15   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_gemm_add_nn_16x4_lib8, @function
+inner_edge_gemm_add_nn_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_gemm_add_nn_16x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_16x4_lib8:
+#endif
+#endif
+	
+	cmpl			$0, %r15d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$8, %ebx
+	subl			%r15d, %ebx // 8-offsetB
+	cmpl			%r10d, %ebx
+//	jle				0f
+//	movl			%r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+	cmovgl			%r10d, %ebx // kend=min(k,8-offsetB)
+
+	movl			%r15d, %eax
+	sall			$2, %eax // offsetB*sizeof(float)
+	addq			%rax, %r13 // B+offsetB*sizeof(float)
+
+1:
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A0
+	vmovaps			0(%r11, %r12, 1), %ymm13 // A1
+	vbroadcastss	0(%r13), %ymm15 // B[0]
+	vfmadd231ps		%ymm12, %ymm15, %ymm0
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vbroadcastss	32(%r13), %ymm15 // B[1]
+	vfmadd231ps		%ymm12, %ymm15, %ymm1
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vbroadcastss	64(%r13), %ymm15 // B[2]
+	vfmadd231ps		%ymm12, %ymm15, %ymm2
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vbroadcastss	96(%r13), %ymm15 // B[3]
+	vfmadd231ps		%ymm12, %ymm15, %ymm3
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+
+	subl			$1, %r10d // k-1
+	subl			$1, %ebx // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(float)
+	addq			$4, %r13 // B+1*sizeof(float)
+
+	cmpl			$0, %ebx
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r14, %r13
+	subq			$32, %r13 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_gemm_add_nn_16x4_lib8, .-inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
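+
+// The offsetB handling above, restated as a hedged C sketch
+// (ref_edge_gemm_add_nn_16x4 is a hypothetical model): peel
+// kend = min(k, 8-offB) single iterations starting offB rows into the current
+// B panel, then advance B to the start of the next panel if any work remains.
+//
+//   static void ref_edge_gemm_add_nn_16x4(int *k, const float **A, int sda,
+//                                         const float **B, int sdb, int offB,
+//                                         float C[16][4])
+//       {
+//       const int bs = 8;
+//       if(offB==0 || *k<=0)
+//           return;
+//       int kend = bs-offB < *k ? bs-offB : *k;
+//       const float *b = *B + offB;
+//       for(int kk=0; kk<kend; kk++)
+//           {
+//           for(int j=0; j<4; j++)
+//               for(int i=0; i<16; i++)
+//                   C[i][j] += (*A)[(i/bs)*bs*sda + i%bs] * b[j*bs];
+//           *A += bs; // next column of A
+//           b += 1;   // next row of B within the panel
+//           }
+//       *k -= kend;
+//       if(*k > 0)
+//           *B += bs*sdb; // start of the next B panel
+//       else
+//           *B = b;
+//       }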
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(float)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- bs*sda*sizeof(float)
+// r13   <- B-offB+bs*sdb*sizeof(float)
+// r14   <- bs*sdb*sizeof(float)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trmm_nn_rl_16x4_lib8, @function
+inner_edge_trmm_nn_rl_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trmm_nn_rl_16x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_16x4_lib8:
+#endif
+#endif
+	
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	movl		%r15d, %eax
+	sall		$2, %eax // offsetB*sizeof(float)
+	movq		%r13, %rbx // B
+	addq		%rax, %rbx // B+offsetB*sizeof(float)
+
+
+	cmpl	$4, %r15d
+	jg		1f
+
+	// offB==0, 1, 2, 3, 4
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm0
+	vfmadd231ps		%ymm9, %ymm12, %ymm4
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	4(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm0
+	vfmadd231ps		%ymm9, %ymm12, %ymm4
+	vbroadcastss	36(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm1
+	vfmadd231ps		%ymm9, %ymm12, %ymm5
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	8(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm0
+	vfmadd231ps		%ymm9, %ymm12, %ymm4
+	vbroadcastss	40(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm1
+	vfmadd231ps		%ymm9, %ymm12, %ymm5
+	vbroadcastss	72(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm2
+	vfmadd231ps		%ymm9, %ymm12, %ymm6
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	jmp			0f // end
+
+
+1:
+	cmpl	$5, %r15d
+	jg		1f
+
+	// offB==5
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm0
+	vfmadd231ps		%ymm9, %ymm12, %ymm4
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	4(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm0
+	vfmadd231ps		%ymm9, %ymm12, %ymm4
+	vbroadcastss	36(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm1
+	vfmadd231ps		%ymm9, %ymm12, %ymm5
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	8(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm0
+	vfmadd231ps		%ymm9, %ymm12, %ymm4
+	vbroadcastss	40(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm1
+	vfmadd231ps		%ymm9, %ymm12, %ymm5
+	vbroadcastss	72(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm2
+	vfmadd231ps		%ymm9, %ymm12, %ymm6
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r14, %r13 // B+8*sdb*sizeof(float)
+	movl		$0, %r15d // offsetB=0
+
+	jmp			0f // end
+
+
+1:
+	cmpl	$6, %r15d
+	jg		1f
+
+	// offB==6
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm0
+	vfmadd231ps		%ymm9, %ymm12, %ymm4
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	4(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm0
+	vfmadd231ps		%ymm9, %ymm12, %ymm4
+	vbroadcastss	36(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm1
+	vfmadd231ps		%ymm9, %ymm12, %ymm5
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r14, %r13 // B+8*sdb*sizeof(float)
+	movq		%r13, %rbx // B
+	movl		$0, %r15d // offsetB=0
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm0
+	vfmadd231ps		%ymm9, %ymm12, %ymm4
+	vbroadcastss	32(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm1
+	vfmadd231ps		%ymm9, %ymm12, %ymm5
+	vbroadcastss	64(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm2
+	vfmadd231ps		%ymm9, %ymm12, %ymm6
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	jmp			0f // end
+
+
+1:
+//	cmpl	$7, %r15d
+//	jg		0f
+
+	// offB==7
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm0
+	vfmadd231ps		%ymm9, %ymm12, %ymm4
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r14, %r13 // B+8*sdb*sizeof(float)
+	movq		%r13, %rbx // B
+	movl		$0, %r15d // offsetB=0
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	0(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm0
+	vfmadd231ps		%ymm9, %ymm12, %ymm4
+	vbroadcastss	32(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm1
+	vfmadd231ps		%ymm9, %ymm12, %ymm5
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vmovaps			0(%r11, %r12, 1), %ymm9
+	vbroadcastss	4(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm0
+	vfmadd231ps		%ymm9, %ymm12, %ymm4
+	vbroadcastss	36(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm1
+	vfmadd231ps		%ymm9, %ymm12, %ymm5
+	vbroadcastss	68(%rbx), %ymm12
+	vfmadd231ps		%ymm8, %ymm12, %ymm2
+	vfmadd231ps		%ymm9, %ymm12, %ymm6
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+//	jmp			0f // end
+
+
+	// end
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trmm_nn_rl_16x4_lib8, .-inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
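+// Reference sketch (comment only, not used by the build): scalar pseudocode of
+// the operation this routine appears to perform. acc[i][j] is notation local
+// to this comment for the 16x4 accumulator (columns 0..3 of rows 0..7 in
+// ymm0..ymm3, rows 8..15 in ymm4..ymm7); D is the 4x4 lower-triangular factor
+// in r10 (stored panel-major, bs=8) and inv_diag_D its reciprocal diagonal:
+//
+//   for(j=0; j<4; j++) {
+//     for(i=0; i<16; i++) acc[i][j] *= inv_diag_D[j];
+//     if(j+1>=kn) break;
+//     for(jj=j+1; jj<4; jj++)
+//       for(i=0; i<16; i++) acc[i][jj] -= D[jj][j] * acc[i][j];
+//   }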
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsm_rlt_inv_16x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsm_rlt_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#endif
+#endif
+
+	vbroadcastss	0(%r11), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vmulps			%ymm4, %ymm13, %ymm4
+	cmpl			$2, %r12d
+	jl				0f // ret
+	vbroadcastss	4(%r10), %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm1
+	vfnmadd231ps	%ymm4, %ymm13, %ymm5
+	vbroadcastss	8(%r10), %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm2
+	vfnmadd231ps	%ymm4, %ymm13, %ymm6
+	vbroadcastss	12(%r10), %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm3
+	vfnmadd231ps	%ymm4, %ymm13, %ymm7
+
+	vbroadcastss	4(%r11), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vmulps			%ymm5, %ymm13, %ymm5
+	cmpl			$3, %r12d
+	jl				0f // ret
+	vbroadcastss	40(%r10), %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm2
+	vfnmadd231ps	%ymm5, %ymm13, %ymm6
+	vbroadcastss	44(%r10), %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm3
+	vfnmadd231ps	%ymm5, %ymm13, %ymm7
+
+	vbroadcastss	8(%r11), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vmulps			%ymm6, %ymm13, %ymm6
+	cmpl			$4, %r12d
+	jl				0f // ret
+	vbroadcastss	76(%r10), %ymm13
+	vfnmadd231ps	%ymm2, %ymm13, %ymm3
+	vfnmadd231ps	%ymm6, %ymm13, %ymm7
+
+	vbroadcastss	12(%r11), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+	vmulps			%ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsm_rlt_inv_16x4_vs_lib8, .-inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization vs
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
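+// Reference sketch (comment only): assumed scalar equivalent, with acc[i][j]
+// denoting the 16x4 accumulator (rows 0..7 in ymm0..ymm3, rows 8..15 in
+// ymm4..ymm7) whose top 4x4 block is the diagonal block being factorized:
+//
+//   for(j=0; j<4; j++) {
+//     d = acc[j][j];
+//     inv = d>0.0f ? 1.0f/sqrtf(d) : 0.0f;
+//     inv_diag_E[j] = inv;
+//     for(i=0; i<16; i++) acc[i][j] *= inv;
+//     if(j+1>=kn) break;
+//     for(jj=j+1; jj<4; jj++)
+//       for(i=0; i<16; i++) acc[i][jj] -= acc[jj][j] * acc[i][j];
+//   }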
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_16x4_vs_lib8, @function
+inner_edge_potrf_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_16x4_vs_lib8:
+#endif
+#endif
+
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vmovss			%xmm0, %xmm0, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+2:
+	vmovss			%xmm13, 0(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vmulps			%ymm4, %ymm13, %ymm4
+	cmpl		$2, %r11d
+	jl			0f // ret
+	vperm2f128		$0x00, %ymm0, %ymm0, %ymm11
+	vpermilps		$0x55, %ymm11, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm1
+	vfnmadd231ps	%ymm4, %ymm13, %ymm5
+	vpermilps		$0xaa, %ymm11, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm2
+	vfnmadd231ps	%ymm4, %ymm13, %ymm6
+	vpermilps		$0xff, %ymm11, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm3
+	vfnmadd231ps	%ymm4, %ymm13, %ymm7
+
+
+	vpermilps		$0x55, %xmm1, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+4:
+	vmovss			%xmm13, 4(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vmulps			%ymm5, %ymm13, %ymm5
+	cmpl		$3, %r11d
+	jl			0f // ret
+	vperm2f128		$0x00, %ymm1, %ymm1, %ymm11
+	vpermilps		$0xaa, %ymm11, %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm2
+	vfnmadd231ps	%ymm5, %ymm13, %ymm6
+	vpermilps		$0xff, %ymm11, %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm3
+	vfnmadd231ps	%ymm5, %ymm13, %ymm7
+
+
+	vpermilps		$0xaa, %xmm2, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+6:
+	vmovss			%xmm13, 8(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vmulps			%ymm6, %ymm13, %ymm6
+	cmpl		$4, %r11d
+	jl			0f // ret
+	vperm2f128		$0x00, %ymm2, %ymm2, %ymm11
+	vpermilps		$0xff, %ymm11, %ymm13
+	vfnmadd231ps	%ymm2, %ymm13, %ymm3
+	vfnmadd231ps	%ymm6, %ymm13, %ymm7
+
+
+	vpermilps		$0xff, %xmm3, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+8:
+	vmovss			%xmm13, 12(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+	vmulps			%ymm7, %ymm13, %ymm7
+
+	jmp		0f
+
+
+1:
+	vxorps			%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorps			%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorps			%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorps			%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_16x4_vs_lib8, .-inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization vs
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
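+// Note (comment only): same scheme as inner_edge_potrf_16x4_vs_lib8 above; the
+// difference assumed here is that the 4x4 diagonal block sits in rows 4..7 of
+// the first 8-row panel, hence the vextractf128 $0x1 / vperm2f128 $0x11 below,
+// which select the upper 128-bit halves instead of the lower ones.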
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_12x4_vs_lib8, @function
+inner_edge_potrf_12x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_12x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_12x4_vs_lib8:
+#endif
+#endif
+
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vextractf128	$0x1, %ymm0, %xmm13
+//	vpermilps		$0x00, %xmm13, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+2:
+	vmovss			%xmm13, 0(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vmulps			%ymm4, %ymm13, %ymm4
+	cmpl		$2, %r11d
+	jl			0f // ret
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm11
+	vpermilps		$0x55, %ymm11, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm1
+	vfnmadd231ps	%ymm4, %ymm13, %ymm5
+	vpermilps		$0xaa, %ymm11, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm2
+	vfnmadd231ps	%ymm4, %ymm13, %ymm6
+	vpermilps		$0xff, %ymm11, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm3
+	vfnmadd231ps	%ymm4, %ymm13, %ymm7
+
+
+	vextractf128	$0x1, %ymm1, %xmm13
+	vpermilps		$0x55, %xmm13, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+4:
+	vmovss			%xmm13, 4(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vmulps			%ymm5, %ymm13, %ymm5
+	cmpl		$3, %r11d
+	jl			0f // ret
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm11
+	vpermilps		$0xaa, %ymm11, %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm2
+	vfnmadd231ps	%ymm5, %ymm13, %ymm6
+	vpermilps		$0xff, %ymm11, %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm3
+	vfnmadd231ps	%ymm5, %ymm13, %ymm7
+
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vpermilps		$0xaa, %xmm13, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+6:
+	vmovss			%xmm13, 8(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vmulps			%ymm6, %ymm13, %ymm6
+	cmpl		$4, %r11d
+	jl			0f // ret
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm11
+	vpermilps		$0xff, %ymm11, %ymm13
+	vfnmadd231ps	%ymm2, %ymm13, %ymm3
+	vfnmadd231ps	%ymm6, %ymm13, %ymm7
+
+
+	vextractf128	$0x1, %ymm3, %xmm13
+	vpermilps		$0xff, %xmm13, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+8:
+	vmovss			%xmm13, 12(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+	vmulps			%ymm7, %ymm13, %ymm7
+
+	jmp		0f
+
+
+1:
+	vxorps			%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorps			%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorps			%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorps			%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_12x4_vs_lib8, .-inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
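+// Reference sketch (comment only): assumed scalar equivalent, with acc[i][j]
+// the 16x4 accumulator (rows 0..7 in ymm0..ymm3, rows 8..15 in ymm4..ymm7) and
+// C stored in two 8-row panels, the second one at C plus the byte stride in r13:
+//
+//   for(j=0; j<4; j++)
+//     for(i=0; i<16; i++)
+//       acc[i][j] = alpha[0]*acc[i][j] + beta[0]*C[i][j];
+//
+// When beta[0]==0.0 the C term is skipped entirely, so C is never dereferenced.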
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_16x4_lib8, @function
+inner_scale_ab_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_16x4_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	movq	%r12, %r15 // C1 <- C0
+	addq	%r13, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	vmovaps		0(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm3
+
+	vmovaps		0(%r15), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm4
+	vmovaps		32(%r15), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm5
+	vmovaps		64(%r15), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm6
+	vmovaps		96(%r15), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_16x4_lib8, .-inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
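+// Note (comment only): same scaling as inner_scale_ab_16x4_lib8 above, but C
+// is addressed through a row offset in r12; only the offset==0 path appears to
+// be implemented, the offset>0 branches below are TODO placeholders that fall
+// through to the end without adding the beta*C term.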
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_16x4_gen_lib8, @function
+inner_scale_ab_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_16x4_gen_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+
+	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	movq	%r13, %rax // C1 <- C0
+	addq	%r14, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm0
+	vmovaps		32(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm1
+	vmovaps		64(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm2
+	vmovaps		96(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm3
+
+	vmovaps		0(%rax), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm4
+	vmovaps		32(%rax), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm5
+	vmovaps		64(%rax), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm6
+	vmovaps		96(%rax), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%rax, %rbx // C1
+	addq	%r14, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_16x4_gen_lib8, .-inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10   <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
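+// Reference sketch (comment only): assumed scalar equivalent, acc[i][j] being
+// the 16x4 accumulator in ymm0..ymm7:
+//
+//   for(j=0; j<4; j++)
+//     for(i=0; i<16; i++)
+//       acc[i][j] *= alpha[0];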
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_A0_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_a0_16x4_lib8, @function
+inner_scale_a0_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_a0_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_16x4_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_a0_16x4_lib8, .-inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
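+// Reference sketch (comment only): assumed scalar equivalent for alpha=1.0 and
+// beta=1.0, with C stored in two 8-row panels (second panel at C plus the byte
+// stride in r11) and acc[i][j] the 16x4 accumulator in ymm0..ymm7:
+//
+//   for(j=0; j<4; j++)
+//     for(i=0; i<16; i++)
+//       acc[i][j] += C[i][j];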
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_16x4_lib8, @function
+inner_scale_11_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_11_16x4_lib8:
+#endif
+#endif
+	
+	movq	%r10, %r15 // C1 <- C0
+	addq	%r11, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	vmovaps		0(%r10), %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r10), %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r10), %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r10), %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+	vmovaps		0(%r15), %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+	vmovaps		32(%r15), %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+	vmovaps		64(%r15), %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+	vmovaps		96(%r15), %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_16x4_lib8, .-inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_16X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_16x4_gen_lib8, @function
+inner_scale_11_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_16x4_gen_lib8:
+#endif
+#endif
+	
+	movq	%r11, %rax // C1 <- C0
+	addq	%r12, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r11), %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r11), %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r11), %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r11), %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+
+	vmovaps		0(%rax), %ymm14
+	vaddps		%ymm4, %ymm14, %ymm4
+	vmovaps		32(%rax), %ymm14
+	vaddps		%ymm5, %ymm14, %ymm5
+	vmovaps		64(%rax), %ymm14
+	vaddps		%ymm6, %ymm14, %ymm6
+	vmovaps		96(%rax), %ymm14
+	vaddps		%ymm7, %ymm14, %ymm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%rax, %rbx // C1
+	addq	%r12, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_16x4_gen_lib8, .-inner_scale_11_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
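+// Reference sketch (comment only): assumed store pattern, D panel-major with
+// bs=8; D0 and D1 are notation local to this comment for the two 8-row panels,
+// D1 sitting at D plus the byte stride passed in r11:
+//
+//   for(j=0; j<4; j++)
+//     for(i=0; i<8; i++) {
+//       D0[i + 8*j] = acc[i][j];    // rows 0..7,  ymm0..ymm3
+//       D1[i + 8*j] = acc[8+i][j];  // rows 8..15, ymm4..ymm7
+//     }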
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_16x4_lib8, @function
+inner_store_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_16x4_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_lib8:
+#endif
+#endif
+	
+	movq	%r10, %r15 // D1 <- D0
+	addq	%r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps		%ymm3, 96(%r10)
+
+	vmovaps 	%ymm4,  0(%r15)
+	vmovaps 	%ymm5, 32(%r15)
+	vmovaps 	%ymm6, 64(%r15)
+	vmovaps 	%ymm7, 96(%r15)
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_16x4_lib8, .-inner_store_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
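+// Note (comment only): same layout as inner_store_16x4_lib8 above, but only
+// the first kn columns are written, and the second 8-row panel is written with
+// a vmaskmovps mask built from km so that only rows 8..km-1 are stored (rows
+// 0..7 are always stored in full).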
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_16X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_16x4_vs_lib8, @function
+inner_store_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	vmovaps		%ymm0, 0(%r10)
+	vmaskmovps	%ymm4, %ymm15, 0(%r10, %r11, 1)
+	cmpl		$2, %r13d
+	jl			7f // end
+	vmovaps		%ymm1, 32(%r10)
+	vmaskmovps	%ymm5, %ymm15, 32(%r10, %r11, 1)
+	cmpl		$3, %r13d
+	jl			7f // end
+	vmovaps		%ymm2, 64(%r10)
+	vmaskmovps	%ymm6, %ymm15, 64(%r10, %r11, 1)
+	je			7f // end
+	vmovaps		%ymm3, 96(%r10)
+	vmaskmovps	%ymm7, %ymm15, 96(%r10, %r11, 1)
+	//
+	jmp		0f
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_16x4_vs_lib8, .-inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
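+// Note (comment only): generalized store. The accumulator is first shifted
+// left by n0 columns, the number of stored columns is clamped to n1-n0 (at
+// most 4), and row masks built from m0/m1 restrict the store to rows in
+// [m0, m1); only the offset==0 case appears to be implemented, the offset>0
+// branches below are TODO placeholders.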
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_16X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_16x4_gen_lib8, @function
+inner_store_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute D1
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 4*sdd*sizeof(float)
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	cmpl		$2, %r15d
+	vmaskmovps	%ymm0, %ymm14,  0(%r11)
+	vmaskmovps	%ymm4, %ymm15,  0(%rbx)
+	jl			7f // end
+	cmpl		$3, %r15d
+	vmaskmovps	%ymm1, %ymm14, 32(%r11)
+	vmaskmovps	%ymm5, %ymm15, 32(%rbx)
+	jl			7f // end
+	vmaskmovps	%ymm2, %ymm14, 64(%r11)
+	vmaskmovps	%ymm6, %ymm15, 64(%rbx)
+	je			7f // end
+	vmaskmovps	%ymm3, %ymm14, 96(%r11)
+	vmaskmovps	%ymm7, %ymm15, 96(%rbx)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbp // D1
+	addq	%r12, %rbp // D2 <- D1 + 4*sdd*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_16x4_gen_lib8, .-inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
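+// Note (comment only): lower-triangular store. Columns 1..3 of the first panel
+// are blended with the existing contents of D (masks $0x01/$0x03/$0x07), so
+// the entries above the diagonal of the top 4x4 block are left untouched,
+// while everything below it, including the whole second 8-row panel, is
+// overwritten.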
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_16X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_16x4_lib8, @function
+inner_store_l_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_16x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_lib8:
+#endif
+#endif
+	
+	vmovaps		32(%r10), %ymm12
+	vmovaps		64(%r10), %ymm13
+	vmovaps		96(%r10), %ymm14
+
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vblendps	$0x03, %ymm13, %ymm2, %ymm2
+	vblendps	$0x07, %ymm14, %ymm3, %ymm3
+
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps		%ymm3, 96(%r10)
+
+	vmovaps 	%ymm4,  0(%r10, %r11, 1)
+	vmovaps 	%ymm5, 32(%r10, %r11, 1)
+	vmovaps 	%ymm6, 64(%r10, %r11, 1)
+	vmovaps 	%ymm7, 96(%r10, %r11, 1)
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_16x4_lib8, .-inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_16X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_16x4_vs_lib8, @function
+inner_store_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	vmovaps		%ymm0, 0(%r10)
+	vmaskmovps	%ymm4, %ymm15, 0(%r10, %r11, 1)
+	cmpl		$2, %r13d
+	jl			0f // end
+	vmovaps		32(%r10), %ymm12
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vmovaps		%ymm1, 32(%r10)
+	vmaskmovps	%ymm5, %ymm15, 32(%r10, %r11, 1)
+	cmpl		$3, %r13d
+	jl			0f // end
+	vmovaps		64(%r10), %ymm12
+	vblendps	$0x03, %ymm12, %ymm2, %ymm2
+	vmovaps		%ymm2, 64(%r10)
+	vmaskmovps	%ymm6, %ymm15, 64(%r10, %r11, 1)
+	je			0f // end
+	vmovaps		96(%r10), %ymm12
+	vblendps	$0x07, %ymm12, %ymm3, %ymm3
+	vmovaps		%ymm3, 96(%r10)
+	vmaskmovps	%ymm7, %ymm15, 96(%r10, %r11, 1)
+	//
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_16x4_vs_lib8, .-inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_16X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_16x4_gen_lib8, @function
+inner_store_l_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm14,  0(%r11)
+	vmaskmovps	%ymm4, %ymm15,  0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmovaps		32(%r11), %ymm12
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm14, 32(%r11)
+	vmaskmovps	%ymm5, %ymm15, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmovaps		64(%r11), %ymm12
+	vblendps	$0x03, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm14, 64(%r11)
+	vmaskmovps	%ymm6, %ymm15, 64(%r11, %r12, 1)
+	je			7f // end
+	vmovaps		96(%r11), %ymm12
+	vblendps	$0x07, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm14, 96(%r11)
+	vmaskmovps	%ymm7, %ymm15, 96(%r11, %r12, 1)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_16x4_gen_lib8, .-inner_store_l_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_12X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_12x4_lib8, @function
+inner_store_l_12x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_12x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_lib8:
+#endif
+#endif
+	
+	vmovaps		0(%r10), %ymm12
+	vmovaps		32(%r10), %ymm13
+	vmovaps		64(%r10), %ymm14
+	vmovaps		96(%r10), %ymm15
+
+	vblendps	$0x0f, %ymm12, %ymm0, %ymm0
+	vblendps	$0x1f, %ymm13, %ymm1, %ymm1
+	vblendps	$0x3f, %ymm14, %ymm2, %ymm2
+	vblendps	$0x7f, %ymm15, %ymm3, %ymm3
+
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps		%ymm3, 96(%r10)
+
+	vmovaps 	%ymm4,  0(%r10, %r11, 1)
+	vmovaps 	%ymm5, 32(%r10, %r11, 1)
+	vmovaps 	%ymm6, 64(%r10, %r11, 1)
+	vmovaps 	%ymm7, 96(%r10, %r11, 1)
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_12x4_lib8, .-inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_12X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_12x4_vs_lib8, @function
+inner_store_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	vmovaps		0(%r10), %ymm12
+	vblendps	$0x0f, %ymm12, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r10)
+	vmaskmovps	%ymm4, %ymm15, 0(%r10, %r11, 1)
+	cmpl		$2, %r13d
+	jl			0f // end
+	vmovaps		32(%r10), %ymm12
+	vblendps	$0x1f, %ymm12, %ymm1, %ymm1
+	vmovaps		%ymm1, 32(%r10)
+	vmaskmovps	%ymm5, %ymm15, 32(%r10, %r11, 1)
+	cmpl		$3, %r13d
+	jl			0f // end
+	vmovaps		64(%r10), %ymm12
+	vblendps	$0x3f, %ymm12, %ymm2, %ymm2
+	vmovaps		%ymm2, 64(%r10)
+	vmaskmovps	%ymm6, %ymm15, 64(%r10, %r11, 1)
+	je			0f // end
+	vmovaps		96(%r10), %ymm12
+	vblendps	$0x7f, %ymm12, %ymm3, %ymm3
+	vmovaps		%ymm3, 96(%r10)
+	vmaskmovps	%ymm7, %ymm15, 96(%r10, %r11, 1)
+	//
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_12x4_vs_lib8, .-inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_12X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_12x4_gen_lib8, @function
+inner_store_l_12x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_12x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+	vmovups		.LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+	vmovups		LC01(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmovaps		0(%r11), %ymm12
+	vblendps	$0x0f, %ymm12, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm14,  0(%r11)
+	vmaskmovps	%ymm4, %ymm15,  0(%r11, %r12, 1)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmovaps		32(%r11), %ymm12
+	vblendps	$0x1f, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm14, 32(%r11)
+	vmaskmovps	%ymm5, %ymm15, 32(%r11, %r12, 1)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmovaps		64(%r11), %ymm12
+	vblendps	$0x3f, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm14, 64(%r11)
+	vmaskmovps	%ymm6, %ymm15, 64(%r11, %r12, 1)
+	je			7f // end
+	vmovaps		96(%r11), %ymm12
+	vblendps	$0x7f, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm14, 96(%r11)
+	vmaskmovps	%ymm7, %ymm15, 96(%r11, %r12, 1)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_12x4_gen_lib8, .-inner_store_l_12x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                                1      2             3         4        5         6            7         8        9         10
+// void kernel_sgemm_nt_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
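+// Assumed semantics (comment only), all operands panel-major with bs=8:
+//   D[0:16,0:4] = alpha * A[0:16,0:k] * B[0:4,0:k]^T + beta * C[0:16,0:4]
+// A, C and D each span two 8-row panels (panel strides sda, sdc, sdd); B uses
+// the first 4 rows of a single 8-row panel.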
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_16x4_lib8
+	.type kernel_sgemm_nt_16x4_lib8, @function
+kernel_sgemm_nt_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_16x4_lib8
+_kernel_sgemm_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_16x4_lib8
+	.def kernel_sgemm_nt_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_16x4_lib8, .-kernel_sgemm_nt_16x4_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5         6            7         8        9         10       11      12
+// void kernel_sgemm_nt_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_16x4_vs_lib8
+	.type kernel_sgemm_nt_16x4_vs_lib8, @function
+kernel_sgemm_nt_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_16x4_vs_lib8
+_kernel_sgemm_nt_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_16x4_vs_lib8
+	.def kernel_sgemm_nt_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_16x4_vs_lib8, .-kernel_sgemm_nt_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                    rdi    rsi           rdx       rcx      r8        r9           rsp+8        rsp+16    rsp+24   rsp+32       rsp+40    rsp+48   rsp+56  rsp+64  rsp+72  rsp+80
+// void kernel_sgemm_nt_16x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_16x4_gen_lib8
+	.type kernel_sgemm_nt_16x4_gen_lib8, @function
+kernel_sgemm_nt_16x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_16x4_gen_lib8
+_kernel_sgemm_nt_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_16x4_gen_lib8
+	.def kernel_sgemm_nt_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // offsetC
+	movq	ARG8, %r13 // C
+	movq	ARG9, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_16x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG10, %r10 // offsetD
+	movq	ARG11, %r11 // D
+	movq	ARG12, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG13, %r13 // m0
+	movq	ARG14, %r14 // m1
+	movq	ARG15, %r15 // n0
+	movq	ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_16x4_gen_lib8, .-kernel_sgemm_nt_16x4_gen_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4        5            6         7        8            9         10       11        12
+// void kernel_sgemm_nn_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd);
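+// Assumed semantics (comment only), all operands panel-major with bs=8:
+//   D[0:16,0:4] = alpha * A[0:16,0:k] * B[0:k,0:4] + beta * C[0:16,0:4]
+// B is read starting offsetB rows into its first 8-row panel (panel stride sdb).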
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_16x4_lib8
+	.type kernel_sgemm_nn_16x4_lib8, @function
+kernel_sgemm_nn_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_16x4_lib8
+_kernel_sgemm_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_16x4_lib8
+	.def kernel_sgemm_nn_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12   // C
+	movq	ARG10, %r13   // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_16x4_lib8, .-kernel_sgemm_nn_16x4_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5            6         7        8            9         10       11        12       13      14
+// void kernel_sgemm_nn_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_16x4_vs_lib8
+	.type kernel_sgemm_nn_16x4_vs_lib8, @function
+kernel_sgemm_nn_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_16x4_vs_lib8
+_kernel_sgemm_nn_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_16x4_vs_lib8
+	.def kernel_sgemm_nn_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12   // C
+	movq	ARG10, %r13   // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG13, %r12 // km
+	movq	ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_16x4_vs_lib8, .-kernel_sgemm_nn_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                    rdi    rsi           rdx       rcx      r8        r9        rsp+8    rsp+16       rsp+24    rsp+32    rsp+40   rsp+48    rsp+56    rsp+64   rsp+72  rsp+80  rsp+88  rsp+96
+// void kernel_sgemm_nn_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_16x4_gen_lib8
+	.type kernel_sgemm_nn_16x4_gen_lib8, @function
+kernel_sgemm_nn_16x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_16x4_gen_lib8
+_kernel_sgemm_nn_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_16x4_gen_lib8
+	.def kernel_sgemm_nn_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12 // offsetC
+	movq	ARG10, %r13 // C
+	movq	ARG11, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_16x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG12, %r10 // offsetD
+	movq	ARG13, %r11 // D
+	movq	ARG14, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG15, %r13 // m0
+	movq	ARG16, %r14 // m1
+	movq	ARG17, %r15 // n0
+	movq	ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_16x4_gen_lib8, .-kernel_sgemm_nn_16x4_gen_lib8
+#endif
+
+
+
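+// Illustrative C-level call of the generalized kernel above (a minimal sketch,
+// not part of this file): k, A, B, C, D, the strides and the offsets are
+// hypothetical and assume panel-major (lib8) storage with sda/sdb/sdc/sdd
+// giving the panel leading dimensions.
+//
+//   float alpha = 1.0f, beta = 1.0f;
+//   // update D[m0:m1, n0:n1] of a 16x4 tile: D = alpha*A*B + beta*C
+//   kernel_sgemm_nn_16x4_gen_lib8(k, &alpha, A, sda, offB, B, sdb,
+//                                 &beta, offC, C, sdc, offD, D, sdd,
+//                                 0, 16, 0, 4);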
+
+
+//                                       rdi    rsi       rdx      rcx       r8        r9       rsp+8     rsp+16   rsp+24    rsp+32 
+// void kernel_strsm_nt_rl_inv_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_16x4_lib8
+	.type kernel_strsm_nt_rl_inv_16x4_lib8, @function
+kernel_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_16x4_lib8
+_kernel_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_16x4_lib8
+	.def kernel_strsm_nt_rl_inv_16x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movl	$4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_16x4_lib8, .-kernel_strsm_nt_rl_inv_16x4_lib8
+#endif
+
+
+
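+// Read of the routine above (inferred from its gemm_sub_nt / scale_11 /
+// trsm_rlt_inv call sequence, so only a summary): it computes, roughly,
+//   D = (C - A * B^T) * E^{-T}
+// for a 16x4 tile, where E is a 4x4 lower-triangular block and inv_diag_E
+// holds the reciprocals of its diagonal entries, so the triangular solve is
+// performed with multiplications instead of divisions.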
+
+
+//                                          rdi    rsi       rdx      rcx       r8        r9       rsp+8     rsp+16   rsp+24    rsp+32             rsp+40  rsp+48
+// void kernel_strsm_nt_rl_inv_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_16x4_vs_lib8
+	.type kernel_strsm_nt_rl_inv_16x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_16x4_vs_lib8
+_kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_16x4_vs_lib8
+	.def kernel_strsm_nt_rl_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG11, %r12 // m1 
+	movq	ARG12, %r13 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_16x4_vs_lib8, .-kernel_strsm_nt_rl_inv_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                             1       2          3         4          5       6          7         8          9         10       11        12       13        14
+// void kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_16x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 8*sdam*sizeof(float)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10  // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG13, %r10  // E 
+	movq	ARG14, %r11  // inv_diag_E 
+	movl	$4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_16x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+#endif
+
+
+
+
+
+//                                                1       2          3         4          5       6          7         8          9         10       11        12       13        14                 15      16
+// void kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 8*sdam*sizeof(float)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10  // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG13, %r10  // E 
+	movq	ARG14, %r11  // inv_diag_E 
+	movq	ARG16, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG15, %r12 // km 
+	movq	ARG16, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                   1      2         3        4         5         6        7         8        9
+// void kernel_spotrf_nt_l_12x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_12x4_lib8
+	.type kernel_spotrf_nt_l_12x4_lib8, @function
+kernel_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_12x4_lib8
+_kernel_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_12x4_lib8
+	.def kernel_spotrf_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_12x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_12x4_lib8, .-kernel_spotrf_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+//                                      1      2         3        4         5         6        7         8        9                  10      11
+// void kernel_spotrf_nt_l_12x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_12x4_vs_lib8
+	.type kernel_spotrf_nt_l_12x4_vs_lib8, @function
+kernel_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_12x4_vs_lib8
+_kernel_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_12x4_vs_lib8
+	.def kernel_spotrf_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_12x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG10, %r12 // m1 
+	movq	ARG11, %r13 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_12x4_vs_lib8, .-kernel_spotrf_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+//                                   1      2         3        4         5         6        7         8        9
+// void kernel_spotrf_nt_l_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_16x4_lib8
+	.type kernel_spotrf_nt_l_16x4_lib8, @function
+kernel_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_16x4_lib8
+_kernel_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_16x4_lib8
+	.def kernel_spotrf_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_16x4_lib8, .-kernel_spotrf_nt_l_16x4_lib8
+#endif
+
+
+
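+// Read of the routine above (inferred from its gemm_sub_nt / scale_11 /
+// potrf-edge call sequence, so only a summary with illustrative slicing):
+//   T            = C - A * B^T                      // 16x4 update
+//   D[0:4,0:4]   = chol_lower(T[0:4,0:4]),  inv_diag_D[j] = 1 / D[j][j]
+//   D[4:16,0:4]  = T[4:16,0:4] * D[0:4,0:4]^{-T}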
+
+
+//                                      1      2         3        4         5         6        7         8        9                  10      11
+// void kernel_spotrf_nt_l_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_16x4_vs_lib8
+	.type kernel_spotrf_nt_l_16x4_vs_lib8, @function
+kernel_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_16x4_vs_lib8
+_kernel_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_16x4_vs_lib8
+	.def kernel_spotrf_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG10, %r12 // m1 
+	movq	ARG11, %r13 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_16x4_vs_lib8, .-kernel_spotrf_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                        1        2          3         4          5       6          7         8          9         10       11        12       13
+// void kernel_ssyrk_spotrf_nt_l_12x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_12x4_lib8
+	.type kernel_ssyrk_spotrf_nt_l_12x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_12x4_lib8
+_kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_12x4_lib8
+	.def kernel_ssyrk_spotrf_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_12x4_lib8, .-kernel_ssyrk_spotrf_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+//                                            1        2          3         4          5       6          7         8          9         10       11        12       13                14      15
+// void kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+	.type kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+	.def kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movq	ARG15, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG14, %r12 // km 
+	movq	ARG15, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+//                                        1        2          3         4          5       6          7         8          9         10       11        12       13
+// void kernel_ssyrk_spotrf_nt_l_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_16x4_lib8
+	.type kernel_ssyrk_spotrf_nt_l_16x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_16x4_lib8
+_kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_16x4_lib8
+	.def kernel_ssyrk_spotrf_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_16x4_lib8, .-kernel_ssyrk_spotrf_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+//                                            1        2          3         4          5       6          7         8          9         10       11        12       13                14      15
+// void kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+	.type kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+	.def kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d   // 4*sdap*sizeof(double)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d                   // 4*sdam*sizeof(double)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movq	ARG15, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+	movq	ARG14, %r12 // km 
+	movq	ARG15, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                  1      2             3         4        5         6            7         8        9         10
+// void kernel_ssyrk_nt_l_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_16x4_lib8
+	.type kernel_ssyrk_nt_l_16x4_lib8, @function
+kernel_ssyrk_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_16x4_lib8
+_kernel_ssyrk_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_16x4_lib8
+	.def kernel_ssyrk_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_16x4_lib8, .-kernel_ssyrk_nt_l_16x4_lib8
+#endif
+
+
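+// Read of the routine above (inferred from its gemm_add_nt / scale_ab /
+// store_l call sequence, so only a summary): it computes the 16x4 tile
+//   D = beta*C + alpha * A * B^T
+// and the `_l_` store writes only the lower-triangular part of the leading
+// 4x4 sub-block (plus the rows below it), as needed for SYRK updates.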
+
+
+
+//                                   1      2             3         4        5         6            7         8        9         10       11      12
+// void kernel_ssyrk_nt_l_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_16x4_vs_lib8
+	.type kernel_ssyrk_nt_l_16x4_vs_lib8, @function
+kernel_ssyrk_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_16x4_vs_lib8
+_kernel_ssyrk_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_16x4_vs_lib8
+	.def kernel_ssyrk_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_16x4_vs_lib8, .-kernel_ssyrk_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                  1      2             3         4        5         6            7         8        9         10
+// void kernel_ssyrk_nt_l_12x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_12x4_lib8
+	.type kernel_ssyrk_nt_l_12x4_lib8, @function
+kernel_ssyrk_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_12x4_lib8
+_kernel_ssyrk_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_12x4_lib8
+	.def kernel_ssyrk_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_12x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_12x4_lib8, .-kernel_ssyrk_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5         6            7         8        9         10       11      12
+// void kernel_ssyrk_nt_l_12x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_12x4_vs_lib8
+	.type kernel_ssyrk_nt_l_12x4_vs_lib8, @function
+kernel_ssyrk_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_12x4_vs_lib8
+_kernel_ssyrk_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_12x4_vs_lib8
+	.def kernel_ssyrk_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_12x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps		%ymm0, %ymm0, %ymm0
+	vmovaps		%ymm0, %ymm1
+	vmovaps		%ymm0, %ymm2
+	vmovaps		%ymm0, %ymm3
+	vmovaps		%ymm0, %ymm4
+	vmovaps		%ymm0, %ymm5
+	vmovaps		%ymm0, %ymm6
+	vmovaps		%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_12x4_vs_lib8, .-kernel_ssyrk_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5            6         7        8         9
+// void kernel_strmm_nn_rl_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_16x4_lib8
+	.type kernel_strmm_nn_rl_16x4_lib8, @function
+kernel_strmm_nn_rl_16x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_16x4_lib8
+_kernel_strmm_nn_rl_16x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_16x4_lib8
+	.def kernel_strmm_nn_rl_16x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_16x4_lib8, .-kernel_strmm_nn_rl_16x4_lib8
+#endif
+
+
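+// Read of the routine above (inferred from its trmm edge / gemm_add_nn /
+// scale_a0 call sequence, so only a summary): it computes a 16x4 tile of
+//   D = alpha * A * B
+// where B is lower triangular ("rl": right, lower) and only its lower part,
+// starting at row offsetB inside an 8-row panel, is referenced.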
+
+
+
+//                                      1      2             3         4        5            6         7        8         9        10      11
+// void kernel_strmm_nn_rl_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_16x4_vs_lib8
+	.type kernel_strmm_nn_rl_16x4_vs_lib8, @function
+kernel_strmm_nn_rl_16x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_16x4_vs_lib8
+_kernel_strmm_nn_rl_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_16x4_vs_lib8
+	.def kernel_strmm_nn_rl_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+	movq	ARG10, %r12 // km
+	movq	ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_16x4_vs_lib8, .-kernel_strmm_nn_rl_16x4_vs_lib8
+#endif
+
+
+
+
+
+//                                       1      2             3         4        5            6         7        8            9         10       11      12      13      14
+// void kernel_strmm_nn_rl_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_16x4_gen_lib8
+	.type kernel_strmm_nn_rl_16x4_gen_lib8, @function
+kernel_strmm_nn_rl_16x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_16x4_gen_lib8
+_kernel_strmm_nn_rl_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_16x4_gen_lib8
+	.def kernel_strmm_nn_rl_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // offsetD
+	movq	ARG9, %r11 // D
+	movq	ARG10, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+	movq	ARG11, %r13 // m0
+	movq	ARG12, %r14 // m1
+	movq	ARG13, %r15 // n0
+	movq	ARG14, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_16x4_gen_lib8, .-kernel_strmm_nn_rl_16x4_gen_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
+
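+// The .long values above are IEEE-754 single-precision bit patterns (e.g.
+// 1056964608 = 0x3F000000 = 0.5f, 1065353216 = 0x3F800000 = 1.0f,
+// 3212836864 = 0xBF800000 = -1.0f).  A small stand-alone C check of that
+// decoding, independent of this file:
+//
+//   #include <stdio.h>
+//   #include <string.h>
+//   #include <stdint.h>
+//   int main(void) {
+//       uint32_t bits[3] = {1056964608u, 1065353216u, 3212836864u};
+//       for (int i = 0; i < 3; i++) {
+//           float f;
+//           memcpy(&f, &bits[i], sizeof f);   // bit-cast without aliasing UB
+//           printf("%u -> %f\n", bits[i], f); // prints 0.5, 1.0, -1.0
+//       }
+//       return 0;
+//   }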
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_sgemm_24x4_lib8.S b/kernel/avx2/kernel_sgemm_24x4_lib8.S
new file mode 100644
index 0000000..b3a027f
--- /dev/null
+++ b/kernel/avx2/kernel_sgemm_24x4_lib8.S
@@ -0,0 +1,7734 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
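+
+// PROLOGUE and EPILOGUE save and restore the callee-saved registers of the
+// respective ABI: rbx, rbp and r12-r15 on System V (Linux/Mac), plus rdi,
+// rsi and xmm6-xmm15 on Windows x64. vzeroupper is issued on entry and
+// before returning to avoid AVX-SSE transition penalties.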
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
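+//
+// reference sketch (illustrative only): with B accessed transposed (NT),
+// the routine accumulates
+//   D[ii][jj] += A[ii][kk] * B[jj][kk]   for ii=0..23, jj=0..3, kk=0..k-1
+// holding the 24x4 result in ymm0-ymm11 as three 8x4 panels:
+// ymm0-ymm3 rows 0-7, ymm4-ymm7 rows 8-15, ymm8-ymm11 rows 16-23.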
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nt_24x4_lib8, @function
+inner_kernel_gemm_add_nt_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nt_24x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_24x4_lib8:
+#endif
+#endif
+	
+// broadcast scheme
+#if 1
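+// each scalar of B is broadcast to all 8 lanes (vbroadcastss) and feeds
+// three FMAs, one per 8-row panel of A; the A panels are kept live in
+// ymm13-ymm15 and reloaded one unroll step ahead of their use.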
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovaps 		0(%r11), %ymm13 // A
+	vmovaps 		0(%r11, %r12, 1), %ymm14 // A
+	vmovaps 		0(%r11, %r12, 2), %ymm15 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vbroadcastss	0(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	subl	$4, %r10d
+	vbroadcastss	4(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vbroadcastss	8(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	vbroadcastss	12(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vmovapd			32(%r11), %ymm13 // A
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	vmovapd			32(%r11, %r12, 1), %ymm14 // A
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+	vmovapd			32(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 1
+	vbroadcastss	32(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vbroadcastss	36(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vbroadcastss	40(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	vbroadcastss	44(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vmovapd			64(%r11), %ymm13 // A
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+	vmovapd			64(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 2
+	vbroadcastss	64(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	addq	$128, %r11
+	vbroadcastss	68(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vbroadcastss	72(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	vbroadcastss	76(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vmovapd			-32(%r11), %ymm13 // A
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	vmovapd			-32(%r11, %r12, 1), %ymm14 // A
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+	vmovapd			-32(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 3
+	vbroadcastss	96(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	addq	$128, %r13
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vbroadcastss	-28(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vbroadcastss	-24(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	vbroadcastss	-20(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vmovapd			0(%r11), %ymm13 // A
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+	vmovapd			0(%r11, %r12, 2), %ymm15 // A
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vbroadcastss	0(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	subl	$4, %r10d
+	vbroadcastss	4(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vbroadcastss	8(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	vbroadcastss	12(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vmovapd			32(%r11), %ymm13 // A
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	vmovapd			32(%r11, %r12, 1), %ymm14 // A
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+	vmovapd			32(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 1
+	vbroadcastss	32(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vbroadcastss	36(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vbroadcastss	40(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	vbroadcastss	44(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vmovapd			64(%r11), %ymm13 // A
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+	vmovapd			64(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 2
+	vbroadcastss	64(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	addq	$128, %r11
+	vbroadcastss	68(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vbroadcastss	72(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	vbroadcastss	76(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vmovapd			-32(%r11), %ymm13 // A
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	vmovapd			-32(%r11, %r12, 1), %ymm14 // A
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+	vmovapd			-32(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 3
+	vbroadcastss	96(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	addq	$128, %r13
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vbroadcastss	-28(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vbroadcastss	-24(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	vbroadcastss	-20(%r13), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+//	vmovapd			0(%r11), %ymm13 // A
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+//	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+//	vmovapd			0(%r11, %r12, 2), %ymm15 // A
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm13 // a
+	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vmovapd			0(%r11, %r12, 2), %ymm15 // A
+	vbroadcastss	0(%r13), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vbroadcastss	4(%r13), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	subl	$1, %r10d
+	vbroadcastss	8(%r13), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	addq	$32, %r11
+	vbroadcastss	12(%r13), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+	addq	$32, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+// shuffle scheme
+#else
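+// alternative scheme, disabled by the #if 1 above: four values of B are
+// loaded at once with vbroadcastf128 and rotated across the FMAs with
+// vpermilps instead of issuing one scalar broadcast per value.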
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	movq	%r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 4*sda*sizeof(float)
+	movq	%r15, %rax // A2 <- A1
+	addq	%r12, %rax // A2 <- A1 + 4*sda*sizeof(float)
+
+	// preload
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vmovaps			0(%r11), %ymm13 // A0
+	vmovaps			0(%r15), %ymm14 // A1
+	vmovaps			0(%rax), %ymm15 // A2
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	// unroll 0
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vpermilps		$0xb1, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vpermilps		$0x4e, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	vpermilps		$0xb1, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vmovaps			32(%r11), %ymm13 // A0
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	vmovaps			32(%r15), %ymm14 // A1
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+	vbroadcastf128	32(%r13), %ymm12 // B
+	vmovaps			32(%rax), %ymm15 // A2
+
+
+	// unroll 1
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vpermilps		$0xb1, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vpermilps		$0x4e, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	vpermilps		$0xb1, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vmovaps			64(%r11), %ymm13 // A0
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	vmovaps			64(%r15), %ymm14 // A1
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+	vbroadcastf128	64(%r13), %ymm12 // B
+	vmovaps			64(%rax), %ymm15 // A2
+
+
+	// unroll 2
+	subl	$4, %r10d
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vpermilps		$0xb1, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vpermilps		$0x4e, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	vpermilps		$0xb1, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vmovaps			96(%r11), %ymm13 // A0
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	vmovaps			96(%r15), %ymm14 // A1
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+	vbroadcastf128	96(%r13), %ymm12 // B
+	vmovaps			96(%rax), %ymm15 // A2
+
+
+	// unroll 3
+	addq	$128, %r13
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vpermilps		$0xb1, %ymm12, %ymm12
+
+	addq	$128, %r11
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vpermilps		$0x4e, %ymm12, %ymm12
+
+	addq	$128, %r15
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	vpermilps		$0xb1, %ymm12, %ymm12
+
+	addq	$128, %rax
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vmovaps			0(%r11), %ymm13 // A0
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	vmovaps			0(%r15), %ymm14 // A1
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+	vbroadcastf128	0(%r13), %ymm12 // B
+	vmovaps			0(%rax), %ymm15 // A2
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vpermilps		$0xb1, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vpermilps		$0x4e, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	vpermilps		$0xb1, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vmovaps			32(%r11), %ymm13 // A0
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	vmovaps			32(%r15), %ymm14 // A1
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+	vbroadcastf128	32(%r13), %ymm12 // B
+	vmovaps			32(%rax), %ymm15 // A2
+
+
+	// unroll 1
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vpermilps		$0xb1, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vpermilps		$0x4e, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	vpermilps		$0xb1, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vmovaps			64(%r11), %ymm13 // A0
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	vmovaps			64(%r15), %ymm14 // A1
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+	vbroadcastf128	64(%r13), %ymm12 // B
+	vmovaps			64(%rax), %ymm15 // A2
+
+
+	// unroll 2
+	subl	$4, %r10d
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vpermilps		$0xb1, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vpermilps		$0x4e, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	vpermilps		$0xb1, %ymm12, %ymm12
+
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vmovaps			96(%r11), %ymm13 // A0
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	vmovaps			96(%r15), %ymm14 // A1
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+	vbroadcastf128	96(%r13), %ymm12 // B
+	vmovaps			96(%rax), %ymm15 // A2
+
+
+	// unroll 3
+	addq	$128, %r13
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vpermilps		$0xb1, %ymm12, %ymm12
+
+	addq	$128, %r11
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vpermilps		$0x4e, %ymm12, %ymm12
+
+	addq	$128, %r15
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	vpermilps		$0xb1, %ymm12, %ymm12
+
+	addq	$128, %rax
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+//	vmovaps			0(%r11), %ymm13 // A0
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+//	vmovaps			0(%r15), %ymm14 // A1
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+//	vbroadcastf128	0(%r13), %ymm12 // B
+//	vmovaps			0(%rax), %ymm15 // A2
+
+
+//	cmpl	$4, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vbroadcastf128	32(%r13), %ymm12 // B
+	vmovaps			32(%r11), %ymm13 // A0
+	vmovaps			32(%r15), %ymm14 // A1
+	vmovaps			32(%rax), %ymm15 // A2
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	subl	$1, %r10d
+
+	vpermilps		$0xb1, %ymm12, %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	addq	$32, %r11
+
+	vpermilps		$0x4e, %ymm12, %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+	addq	$32, %r13
+
+	vpermilps		$0xb1, %ymm12, %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	vfmadd231ps		%ymm15, %ymm12, %ymm11
+	addq	$32, %r15
+
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nt_24x4_lib8, .-inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
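+//
+// same structure as inner_kernel_gemm_add_nt_24x4_lib8 above; the only
+// difference is vfnmadd231ps, so the products are subtracted from the
+// accumulators instead of added.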
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_sub_nt_24x4_lib8, @function
+inner_kernel_gemm_sub_nt_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_sub_nt_24x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_24x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vmovaps 		0(%r11), %ymm13 // A
+	vmovaps 		0(%r11, %r12, 1), %ymm14 // A
+	vmovaps 		0(%r11, %r12, 2), %ymm15 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vbroadcastss	0(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	vfnmadd231ps	%ymm15, %ymm12, %ymm8
+	subl	$4, %r10d
+	vbroadcastss	4(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vfnmadd231ps	%ymm15, %ymm12, %ymm9
+	vbroadcastss	8(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vfnmadd231ps	%ymm15, %ymm12, %ymm10
+	vbroadcastss	12(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vmovapd			32(%r11), %ymm13 // A
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+	vmovapd			32(%r11, %r12, 1), %ymm14 // A
+	vfnmadd231ps	%ymm15, %ymm12, %ymm11
+	vmovapd			32(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 1
+	vbroadcastss	32(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	vfnmadd231ps	%ymm15, %ymm12, %ymm8
+	vbroadcastss	36(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vfnmadd231ps	%ymm15, %ymm12, %ymm9
+	vbroadcastss	40(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vfnmadd231ps	%ymm15, %ymm12, %ymm10
+	vbroadcastss	44(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vmovapd			64(%r11), %ymm13 // A
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A
+	vfnmadd231ps	%ymm15, %ymm12, %ymm11
+	vmovapd			64(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 2
+	vbroadcastss	64(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	vfnmadd231ps	%ymm15, %ymm12, %ymm8
+	addq	$128, %r11
+	vbroadcastss	68(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vfnmadd231ps	%ymm15, %ymm12, %ymm9
+	vbroadcastss	72(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vfnmadd231ps	%ymm15, %ymm12, %ymm10
+	vbroadcastss	76(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vmovapd			-32(%r11), %ymm13 // A
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+	vmovapd			-32(%r11, %r12, 1), %ymm14 // A
+	vfnmadd231ps	%ymm15, %ymm12, %ymm11
+	vmovapd			-32(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 3
+	vbroadcastss	96(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	addq	$128, %r13
+	vfnmadd231ps	%ymm15, %ymm12, %ymm8
+	vbroadcastss	-28(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vfnmadd231ps	%ymm15, %ymm12, %ymm9
+	vbroadcastss	-24(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vfnmadd231ps	%ymm15, %ymm12, %ymm10
+	vbroadcastss	-20(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vmovapd			0(%r11), %ymm13 // A
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vfnmadd231ps	%ymm15, %ymm12, %ymm11
+	vmovapd			0(%r11, %r12, 2), %ymm15 // A
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vbroadcastss	0(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	vfnmadd231ps	%ymm15, %ymm12, %ymm8
+	subl	$4, %r10d
+	vbroadcastss	4(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vfnmadd231ps	%ymm15, %ymm12, %ymm9
+	vbroadcastss	8(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vfnmadd231ps	%ymm15, %ymm12, %ymm10
+	vbroadcastss	12(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vmovapd			32(%r11), %ymm13 // A
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+	vmovapd			32(%r11, %r12, 1), %ymm14 // A
+	vfnmadd231ps	%ymm15, %ymm12, %ymm11
+	vmovapd			32(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 1
+	vbroadcastss	32(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	vfnmadd231ps	%ymm15, %ymm12, %ymm8
+	vbroadcastss	36(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vfnmadd231ps	%ymm15, %ymm12, %ymm9
+	vbroadcastss	40(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vfnmadd231ps	%ymm15, %ymm12, %ymm10
+	vbroadcastss	44(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vmovapd			64(%r11), %ymm13 // A
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+	vmovapd			64(%r11, %r12, 1), %ymm14 // A
+	vfnmadd231ps	%ymm15, %ymm12, %ymm11
+	vmovapd			64(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 2
+	vbroadcastss	64(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	vfnmadd231ps	%ymm15, %ymm12, %ymm8
+	addq	$128, %r11
+	vbroadcastss	68(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vfnmadd231ps	%ymm15, %ymm12, %ymm9
+	vbroadcastss	72(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vfnmadd231ps	%ymm15, %ymm12, %ymm10
+	vbroadcastss	76(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vmovapd			-32(%r11), %ymm13 // A
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+	vmovapd			-32(%r11, %r12, 1), %ymm14 // A
+	vfnmadd231ps	%ymm15, %ymm12, %ymm11
+	vmovapd			-32(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 3
+	vbroadcastss	96(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	addq	$128, %r13
+	vfnmadd231ps	%ymm15, %ymm12, %ymm8
+	vbroadcastss	-28(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vfnmadd231ps	%ymm15, %ymm12, %ymm9
+	vbroadcastss	-24(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vfnmadd231ps	%ymm15, %ymm12, %ymm10
+	vbroadcastss	-20(%r13), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+//	vmovapd			0(%r11), %ymm13 // A
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+//	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vfnmadd231ps	%ymm15, %ymm12, %ymm11
+//	vmovapd			0(%r11, %r12, 2), %ymm15 // A
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd			0(%r11), %ymm13 // a
+	vmovapd			0(%r11, %r12, 1), %ymm14 // A
+	vmovapd			0(%r11, %r12, 2), %ymm15 // A
+	vbroadcastss	0(%r13), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	vfnmadd231ps	%ymm15, %ymm12, %ymm8
+	vbroadcastss	4(%r13), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vfnmadd231ps	%ymm15, %ymm12, %ymm9
+	subl	$1, %r10d
+	vbroadcastss	8(%r13), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vfnmadd231ps	%ymm15, %ymm12, %ymm10
+	addq	$32, %r11
+	vbroadcastss	12(%r13), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+	vfnmadd231ps	%ymm15, %ymm12, %ymm11
+	addq	$32, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_sub_nt_24x4_lib8, .-inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B
+// r14   <- 8*sdb*sizeof(float)
+// r15   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- 8*sda*sizeof(float)
+// r13   <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r14   <- 8*sdb*sizeof(float)
+// r15   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
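+//
+// reference sketch (illustrative only): in the NN variant B is not
+// transposed, so D[ii][jj] += A[ii][kk] * B[kk][jj]; within the current
+// 8x4 block of B, element (kk,jj) sits at byte offset 4*kk + 32*jj, hence
+// the broadcasts at offsets 0/32/64/96 shifted by 4 bytes per unroll step.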
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nn_24x4_lib8, @function
+inner_kernel_gemm_add_nn_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nn_24x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_24x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$8, %r10d
+	jl		0f // consider clean-up loop
+
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	0(%r13, %r14, 1) // software prefetch
+	prefetcht0	64(%r13, %r14, 1) // software prefetch
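+	// 0(%r13, %r14, 1) is B plus one panel stride, i.e. the 8x4 block of B
+	// used after the addq %r14, %r13 at the end of this loop; the two
+	// prefetches cover its 128 bytes.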
+
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A0
+	vmovaps			0(%r11, %r12, 1), %ymm13 // A1
+	vmovaps			0(%r11, %r12, 2), %ymm14 // A2
+	vbroadcastss	0(%r13), %ymm15 // B[0]
+	vfmadd231ps		%ymm12, %ymm15, %ymm0
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vfmadd231ps		%ymm14, %ymm15, %ymm8
+	vbroadcastss	32(%r13), %ymm15 // B[1]
+	vfmadd231ps		%ymm12, %ymm15, %ymm1
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vfmadd231ps		%ymm14, %ymm15, %ymm9
+	vbroadcastss	64(%r13), %ymm15 // B[2]
+	vfmadd231ps		%ymm12, %ymm15, %ymm2
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vfmadd231ps		%ymm14, %ymm15, %ymm10
+	vbroadcastss	96(%r13), %ymm15 // B[3]
+	vfmadd231ps		%ymm12, %ymm15, %ymm3
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+	vfmadd231ps		%ymm14, %ymm15, %ymm11
+
+
+	// unroll 1
+	vmovaps			32(%r11), %ymm12 // A0
+	vmovaps			32(%r11, %r12, 1), %ymm13 // A1
+	vmovaps			32(%r11, %r12, 2), %ymm14 // A2
+	vbroadcastss	4(%r13), %ymm15 // B[0]
+	vfmadd231ps		%ymm12, %ymm15, %ymm0
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vfmadd231ps		%ymm14, %ymm15, %ymm8
+	vbroadcastss	36(%r13), %ymm15 // B[1]
+	vfmadd231ps		%ymm12, %ymm15, %ymm1
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vfmadd231ps		%ymm14, %ymm15, %ymm9
+	vbroadcastss	68(%r13), %ymm15 // B[2]
+	vfmadd231ps		%ymm12, %ymm15, %ymm2
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vfmadd231ps		%ymm14, %ymm15, %ymm10
+	vbroadcastss	100(%r13), %ymm15 // B[3]
+	vfmadd231ps		%ymm12, %ymm15, %ymm3
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+	vfmadd231ps		%ymm14, %ymm15, %ymm11
+
+
+	// unroll 2
+	vmovaps			64(%r11), %ymm12 // A0
+	vmovaps			64(%r11, %r12, 1), %ymm13 // A1
+	vmovaps			64(%r11, %r12, 2), %ymm14 // A2
+	vbroadcastss	8(%r13), %ymm15 // B[0]
+	vfmadd231ps		%ymm12, %ymm15, %ymm0
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vfmadd231ps		%ymm14, %ymm15, %ymm8
+	vbroadcastss	40(%r13), %ymm15 // B[1]
+	vfmadd231ps		%ymm12, %ymm15, %ymm1
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vfmadd231ps		%ymm14, %ymm15, %ymm9
+	vbroadcastss	72(%r13), %ymm15 // B[2]
+	vfmadd231ps		%ymm12, %ymm15, %ymm2
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vfmadd231ps		%ymm14, %ymm15, %ymm10
+	vbroadcastss	104(%r13), %ymm15 // B[3]
+	vfmadd231ps		%ymm12, %ymm15, %ymm3
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+	vfmadd231ps		%ymm14, %ymm15, %ymm11
+
+
+	// unroll 3
+	vmovaps			96(%r11), %ymm12 // A0
+	vmovaps			96(%r11, %r12, 1), %ymm13 // A1
+	vmovaps			96(%r11, %r12, 2), %ymm14 // A2
+	vbroadcastss	12(%r13), %ymm15 // B[0]
+	vfmadd231ps		%ymm12, %ymm15, %ymm0
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vfmadd231ps		%ymm14, %ymm15, %ymm8
+	vbroadcastss	44(%r13), %ymm15 // B[1]
+	vfmadd231ps		%ymm12, %ymm15, %ymm1
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vfmadd231ps		%ymm14, %ymm15, %ymm9
+	vbroadcastss	76(%r13), %ymm15 // B[2]
+	vfmadd231ps		%ymm12, %ymm15, %ymm2
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vfmadd231ps		%ymm14, %ymm15, %ymm10
+	vbroadcastss	108(%r13), %ymm15 // B[3]
+	vfmadd231ps		%ymm12, %ymm15, %ymm3
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+	vfmadd231ps		%ymm14, %ymm15, %ymm11
+
+
+	// unroll 4
+	vmovaps			128(%r11), %ymm12 // A0
+	vmovaps			128(%r11, %r12, 1), %ymm13 // A1
+	vmovaps			128(%r11, %r12, 2), %ymm14 // A2
+	vbroadcastss	16(%r13), %ymm15 // B[0]
+	vfmadd231ps		%ymm12, %ymm15, %ymm0
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vfmadd231ps		%ymm14, %ymm15, %ymm8
+	vbroadcastss	48(%r13), %ymm15 // B[1]
+	vfmadd231ps		%ymm12, %ymm15, %ymm1
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vfmadd231ps		%ymm14, %ymm15, %ymm9
+	vbroadcastss	80(%r13), %ymm15 // B[2]
+	vfmadd231ps		%ymm12, %ymm15, %ymm2
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vfmadd231ps		%ymm14, %ymm15, %ymm10
+	vbroadcastss	112(%r13), %ymm15 // B[3]
+	vfmadd231ps		%ymm12, %ymm15, %ymm3
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+	vfmadd231ps		%ymm14, %ymm15, %ymm11
+
+
+	// unroll 5
+	vmovaps			160(%r11), %ymm12 // A0
+	vmovaps			160(%r11, %r12, 1), %ymm13 // A1
+	vmovaps			160(%r11, %r12, 2), %ymm14 // A2
+	vbroadcastss	20(%r13), %ymm15 // B[0]
+	vfmadd231ps		%ymm12, %ymm15, %ymm0
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vfmadd231ps		%ymm14, %ymm15, %ymm8
+	vbroadcastss	52(%r13), %ymm15 // B[1]
+	vfmadd231ps		%ymm12, %ymm15, %ymm1
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vfmadd231ps		%ymm14, %ymm15, %ymm9
+	vbroadcastss	84(%r13), %ymm15 // B[2]
+	vfmadd231ps		%ymm12, %ymm15, %ymm2
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vfmadd231ps		%ymm14, %ymm15, %ymm10
+	vbroadcastss	116(%r13), %ymm15 // B[3]
+	vfmadd231ps		%ymm12, %ymm15, %ymm3
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+	vfmadd231ps		%ymm14, %ymm15, %ymm11
+
+
+	// unroll 6
+	vmovaps			192(%r11), %ymm12 // A0
+	vmovaps			192(%r11, %r12, 1), %ymm13 // A1
+	vmovaps			192(%r11, %r12, 2), %ymm14 // A2
+	vbroadcastss	24(%r13), %ymm15 // B[0]
+	vfmadd231ps		%ymm12, %ymm15, %ymm0
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vfmadd231ps		%ymm14, %ymm15, %ymm8
+	vbroadcastss	56(%r13), %ymm15 // B[1]
+	vfmadd231ps		%ymm12, %ymm15, %ymm1
+	subl	$8, %r10d
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vfmadd231ps		%ymm14, %ymm15, %ymm9
+	vbroadcastss	88(%r13), %ymm15 // B[2]
+	vfmadd231ps		%ymm12, %ymm15, %ymm2
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vbroadcastss	120(%r13), %ymm15 // B[3]
+	vfmadd231ps		%ymm14, %ymm15, %ymm10
+	vfmadd231ps		%ymm12, %ymm15, %ymm3
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+	vfmadd231ps		%ymm14, %ymm15, %ymm11
+
+
+	// unroll 7
+	vmovaps			224(%r11), %ymm12 // A0
+	vmovaps			224(%r11, %r12, 1), %ymm13 // A1
+	vmovaps			224(%r11, %r12, 2), %ymm14 // A2
+	vbroadcastss	28(%r13), %ymm15 // B[0]
+	vfmadd231ps		%ymm12, %ymm15, %ymm0
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vfmadd231ps		%ymm14, %ymm15, %ymm8
+	vbroadcastss	60(%r13), %ymm15 // B[1]
+	vfmadd231ps		%ymm12, %ymm15, %ymm1
+	addq	$256, %r11
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vfmadd231ps		%ymm14, %ymm15, %ymm9
+	vbroadcastss	92(%r13), %ymm15 // B[2]
+	vfmadd231ps		%ymm12, %ymm15, %ymm2
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vfmadd231ps		%ymm14, %ymm15, %ymm10
+	vbroadcastss	124(%r13), %ymm15 // B[3]
+	vfmadd231ps		%ymm12, %ymm15, %ymm3
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+	vfmadd231ps		%ymm14, %ymm15, %ymm11
+
+	addq	%r14, %r13
+
+	cmpl	$7, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean1-up loop
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean1-up loop
+	
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A0
+	vmovaps			0(%r11, %r12, 1), %ymm13 // A1
+	vmovaps			0(%r11, %r12, 2), %ymm14 // A2
+	vbroadcastss	0(%r13), %ymm15 // B[0]
+	vfmadd231ps		%ymm12, %ymm15, %ymm0
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vfmadd231ps		%ymm14, %ymm15, %ymm8
+	vbroadcastss	32(%r13), %ymm15 // B[1]
+	vfmadd231ps		%ymm12, %ymm15, %ymm1
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vfmadd231ps		%ymm14, %ymm15, %ymm9
+	vbroadcastss	64(%r13), %ymm15 // B[2]
+	vfmadd231ps		%ymm12, %ymm15, %ymm2
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vfmadd231ps		%ymm14, %ymm15, %ymm10
+	vbroadcastss	96(%r13), %ymm15 // B[3]
+	vfmadd231ps		%ymm12, %ymm15, %ymm3
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+	vfmadd231ps		%ymm14, %ymm15, %ymm11
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$4, %r13
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nn_24x4_lib8, .-inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(float)
+// r15   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(8-offB)
+// r11   <- A+(8-offB)*bs*sizeof(float)
+// r12   <- bs*sda*sizeof(float)
+// r13   <- B+bs*sdb*sizeof(float)
+// r14   <- bs*sdb*sizeof(float)
+// r15   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
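+//
+// when B starts at a non-zero offset inside its 8-wide panel (offB != 0),
+// this edge consumes kend = min(k, 8-offB) k-iterations one at a time
+// until B is aligned to a panel boundary, then advances B to the start of
+// the next panel.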
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_gemm_add_nn_24x4_lib8, @function
+inner_edge_gemm_add_nn_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_gemm_add_nn_24x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_24x4_lib8:
+#endif
+#endif
+	
+	cmpl			$0, %r15d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$8, %ebx
+	subl			%r15d, %ebx // 8-offsetB
+	cmpl			%r10d, %ebx
+//	jle				0f
+//	movl			%r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+	cmovgl			%r10d, %ebx // kend=min(k,8-offsetB)
+
+	movl			%r15d, %eax
+	sall			$2, %eax // offsetB*sizeof(float)
+	addq			%rax, %r13 // B+offsetB*sizeof(float)
+
+1:
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A0
+	vmovaps			0(%r11, %r12, 1), %ymm13 // A1
+	vmovaps			0(%r11, %r12, 2), %ymm14 // A2
+	vbroadcastss	0(%r13), %ymm15 // B[0]
+	vfmadd231ps		%ymm12, %ymm15, %ymm0
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vfmadd231ps		%ymm14, %ymm15, %ymm8
+	vbroadcastss	32(%r13), %ymm15 // B[1]
+	vfmadd231ps		%ymm12, %ymm15, %ymm1
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vfmadd231ps		%ymm14, %ymm15, %ymm9
+	vbroadcastss	64(%r13), %ymm15 // B[2]
+	vfmadd231ps		%ymm12, %ymm15, %ymm2
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vfmadd231ps		%ymm14, %ymm15, %ymm10
+	vbroadcastss	96(%r13), %ymm15 // B[3]
+	vfmadd231ps		%ymm12, %ymm15, %ymm3
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+	vfmadd231ps		%ymm14, %ymm15, %ymm11
+
+	subl			$1, %r10d // k-1
+	subl			$1, %ebx // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(float)
+	addq			$4, %r13 // B+1*sizeof(float)
+
+	cmpl			$0, %ebx
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r14, %r13
+	subq			$32, %r13 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_gemm_add_nn_24x4_lib8, .-inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- bs*sda*sizeof(float)
+// r13   <- B
+// r14   <- bs*sdb*sizeof(float)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- bs*sda*sizeof(float)
+// r13   <- B-offB+bs*sdb*sizeof(double)
+// r14   <- bs*sdb*sizeof(float)
+// r15   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
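+//
+// this edge performs the first three k-iterations, where the lower
+// triangular B contributes only 1, 2 and 3 columns respectively; the
+// branches on offsetB (0-4, 5, 6 and 7) handle the cases where these
+// rows of B cross an 8-wide panel boundary.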
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRMM_NN_RL_24X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trmm_nn_rl_24x4_lib8, @function
+inner_edge_trmm_nn_rl_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trmm_nn_rl_24x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_24x4_lib8:
+#endif
+#endif
+	
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	movl		%r15d, %eax
+	sall		$2, %eax // offsetB*sizeof(float)
+	movq		%r13, %rbx // B
+	addq		%rax, %rbx // B+offsetB*sizeof(float)
+
+
+	cmpl	$4, %r15d
+	jg		1f
+
+	// offB==0, 1, 2, 3, 4
+
+	vmovaps			0(%r11), %ymm13
+	vmovaps			0(%r11, %r12, 1), %ymm14
+	vmovaps			0(%r11, %r12, 2), %ymm15
+	vbroadcastss	0(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm13
+	vmovaps			0(%r11, %r12, 1), %ymm14
+	vmovaps			0(%r11, %r12, 2), %ymm15
+	vbroadcastss	4(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vbroadcastss	36(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm13
+	vmovaps			0(%r11, %r12, 1), %ymm14
+	vmovaps			0(%r11, %r12, 2), %ymm15
+	vbroadcastss	8(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vbroadcastss	40(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vbroadcastss	72(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	jmp			0f // end
+
+
+1:
+	cmpl	$5, %r15d
+	jg		1f
+
+	// offB==5
+
+	vmovaps			0(%r11), %ymm13
+	vmovaps			0(%r11, %r12, 1), %ymm14
+	vmovaps			0(%r11, %r12, 2), %ymm15
+	vbroadcastss	0(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm13
+	vmovaps			0(%r11, %r12, 1), %ymm14
+	vmovaps			0(%r11, %r12, 2), %ymm15
+	vbroadcastss	4(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vbroadcastss	36(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm13
+	vmovaps			0(%r11, %r12, 1), %ymm14
+	vmovaps			0(%r11, %r12, 2), %ymm15
+	vbroadcastss	8(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vbroadcastss	40(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vbroadcastss	72(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r14, %r13 // B+8*sdb*sizeof(float)
+	movl		$0, %r15d // offsetB=0
+
+	jmp			0f // end
+
+
+1:
+	cmpl	$6, %r15d
+	jg		1f
+
+	// offB==6
+
+	vmovaps			0(%r11), %ymm13
+	vmovaps			0(%r11, %r12, 1), %ymm14
+	vmovaps			0(%r11, %r12, 2), %ymm15
+	vbroadcastss	0(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm13
+	vmovaps			0(%r11, %r12, 1), %ymm14
+	vmovaps			0(%r11, %r12, 2), %ymm15
+	vbroadcastss	4(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vbroadcastss	36(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r14, %r13 // B+8*sdb*sizeof(float)
+	movq		%r13, %rbx // B
+	movl		$0, %r15d // offsetB=0
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm13
+	vmovaps			0(%r11, %r12, 1), %ymm14
+	vmovaps			0(%r11, %r12, 2), %ymm15
+	vbroadcastss	0(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vbroadcastss	32(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vbroadcastss	64(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	jmp			0f // end
+
+
+1:
+//	cmpl	$7, %r15d
+//	jg		0f
+
+	// offB==7
+
+	vmovaps			0(%r11), %ymm13
+	vmovaps			0(%r11, %r12, 1), %ymm14
+	vmovaps			0(%r11, %r12, 2), %ymm15
+	vbroadcastss	0(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r14, %r13 // B+8*sdb*sizeof(float)
+	movq		%r13, %rbx // B
+	movl		$0, %r15d // offsetB=0
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm13
+	vmovaps			0(%r11, %r12, 1), %ymm14
+	vmovaps			0(%r11, %r12, 2), %ymm15
+	vbroadcastss	0(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vbroadcastss	32(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm13
+	vmovaps			0(%r11, %r12, 1), %ymm14
+	vmovaps			0(%r11, %r12, 2), %ymm15
+	vbroadcastss	4(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vfmadd231ps		%ymm15, %ymm12, %ymm8
+	vbroadcastss	36(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vfmadd231ps		%ymm15, %ymm12, %ymm9
+	vbroadcastss	68(%rbx), %ymm12
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vfmadd231ps		%ymm15, %ymm12, %ymm10
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r15d // offsetB+1
+
+//	jmp			0f // end
+
+
+	// end
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trmm_nn_rl_24x4_lib8, .-inner_edge_trmm_nn_rl_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
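+//
+// forward substitution on the 24x4 block: column j is scaled by
+// inv_diag_D[j], then the scaled column times the sub-diagonal entries of
+// the triangular factor (read from r10 with a 32-byte column stride) is
+// subtracted from the later columns; kn (r12d) allows an early exit when
+// fewer than 4 columns are needed.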
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSM_RLT_INV_24X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsm_rlt_inv_24x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_24x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsm_rlt_inv_24x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_24x4_vs_lib8:
+#endif
+#endif
+
+	vbroadcastss	0(%r11), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vmulps			%ymm4, %ymm13, %ymm4
+	vmulps			%ymm8, %ymm13, %ymm8
+	cmpl			$2, %r12d
+	jl				0f // ret
+	vbroadcastss	4(%r10), %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm1
+	vfnmadd231ps	%ymm4, %ymm13, %ymm5
+	vfnmadd231ps	%ymm8, %ymm13, %ymm9
+	vbroadcastss	8(%r10), %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm2
+	vfnmadd231ps	%ymm4, %ymm13, %ymm6
+	vfnmadd231ps	%ymm8, %ymm13, %ymm10
+	vbroadcastss	12(%r10), %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm3
+	vfnmadd231ps	%ymm4, %ymm13, %ymm7
+	vfnmadd231ps	%ymm8, %ymm13, %ymm11
+
+	vbroadcastss	4(%r11), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vmulps			%ymm5, %ymm13, %ymm5
+	vmulps			%ymm9, %ymm13, %ymm9
+	cmpl			$3, %r12d
+	jl				0f // ret
+	vbroadcastss	40(%r10), %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm2
+	vfnmadd231ps	%ymm5, %ymm13, %ymm6
+	vfnmadd231ps	%ymm9, %ymm13, %ymm10
+	vbroadcastss	44(%r10), %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm3
+	vfnmadd231ps	%ymm5, %ymm13, %ymm7
+	vfnmadd231ps	%ymm9, %ymm13, %ymm11
+
+	vbroadcastss	8(%r11), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vmulps			%ymm6, %ymm13, %ymm6
+	vmulps			%ymm10, %ymm13, %ymm10
+	cmpl			$4, %r12d
+	jl				0f // ret
+	vbroadcastss	76(%r10), %ymm13
+	vfnmadd231ps	%ymm2, %ymm13, %ymm3
+	vfnmadd231ps	%ymm6, %ymm13, %ymm7
+	vfnmadd231ps	%ymm10, %ymm13, %ymm11
+
+	vbroadcastss	12(%r11), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+	vmulps			%ymm7, %ymm13, %ymm7
+	vmulps			%ymm11, %ymm13, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsm_rlt_inv_24x4_vs_lib8, .-inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
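+//
+// per column j: test the pivot d_jj > 0, compute 1/sqrt(d_jj) with
+// vsqrtss and vdivss, store it to inv_diag_E[j], broadcast it and scale
+// column j across the three 8-row panels, then subtract the rank-1 update
+// from the remaining columns; a non-positive pivot branches to the
+// handlers at labels 1/3/5/7, which substitute a zero scale factor.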
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_24X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_24x4_vs_lib8, @function
+inner_edge_potrf_24x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_24x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_24x4_vs_lib8:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vmovss			%xmm0, %xmm0, %xmm13
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+	vucomiss		%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+2:
+	vmovss			%xmm13, 0(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vmulps			%ymm4, %ymm13, %ymm4
+	vmulps			%ymm8, %ymm13, %ymm8
+	cmpl		$2, %r11d
+	jl			0f // ret
+	vperm2f128		$0x00, %ymm0, %ymm0, %ymm15
+	vpermilps		$0x55, %ymm15, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm1
+	vfnmadd231ps	%ymm4, %ymm13, %ymm5
+	vfnmadd231ps	%ymm8, %ymm13, %ymm9
+	vpermilps		$0xaa, %ymm15, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm2
+	vfnmadd231ps	%ymm4, %ymm13, %ymm6
+	vfnmadd231ps	%ymm8, %ymm13, %ymm10
+	vpermilps		$0xff, %ymm15, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm3
+	vfnmadd231ps	%ymm4, %ymm13, %ymm7
+	vfnmadd231ps	%ymm8, %ymm13, %ymm11
+
+
+	vpermilps		$0x55, %xmm1, %xmm13
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+	vucomiss		%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+4:
+	vmovss			%xmm13, 4(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vmulps			%ymm5, %ymm13, %ymm5
+	vmulps			%ymm9, %ymm13, %ymm9
+	cmpl		$3, %r11d
+	jl			0f // ret
+	vperm2f128		$0x00, %ymm1, %ymm1, %ymm15
+	vpermilps		$0xaa, %ymm15, %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm2
+	vfnmadd231ps	%ymm5, %ymm13, %ymm6
+	vfnmadd231ps	%ymm9, %ymm13, %ymm10
+	vpermilps		$0xff, %ymm15, %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm3
+	vfnmadd231ps	%ymm5, %ymm13, %ymm7
+	vfnmadd231ps	%ymm9, %ymm13, %ymm11
+
+
+	vpermilps		$0xaa, %xmm2, %xmm13
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+	vucomiss		%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+6:
+	vmovss			%xmm13, 8(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vmulps			%ymm6, %ymm13, %ymm6
+	vmulps			%ymm10, %ymm13, %ymm10
+	cmpl		$4, %r11d
+	jl			0f // ret
+	vperm2f128		$0x00, %ymm2, %ymm2, %ymm15
+	vpermilps		$0xff, %ymm15, %ymm13
+	vfnmadd231ps	%ymm2, %ymm13, %ymm3
+	vfnmadd231ps	%ymm6, %ymm13, %ymm7
+	vfnmadd231ps	%ymm10, %ymm13, %ymm11
+
+
+	vpermilps		$0xff, %xmm3, %xmm13
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+	vucomiss		%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+8:
+	vmovsd			%xmm13, 12(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+	vmulps			%ymm7, %ymm13, %ymm7
+	vmulps			%ymm11, %ymm13, %ymm11
+
+	jmp		0f
+
+
+1:
+	vxorps			%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd			%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_24x4_vs_lib8, .-inner_edge_potrf_24x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
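+//
+// same factorization as inner_edge_potrf_24x4_vs_lib8 above, but the 4x4
+// diagonal block sits in rows 4-7 of the first panel, so the pivots are
+// taken from the upper 128-bit lane (vextractf128 $0x1) and the column
+// multipliers with vperm2f128 $0x11 instead of $0x00.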
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_20X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_20x4_vs_lib8, @function
+inner_edge_potrf_20x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_20x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_20x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_20x4_vs_lib8:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vextractf128	$0x1, %ymm0, %xmm13
+//	vpermilps		$0x00, %xmm13, %xmm13
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+	vucomiss		%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+2:
+	vmovss			%xmm13, 0(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vmulps			%ymm4, %ymm13, %ymm4
+	vmulps			%ymm8, %ymm13, %ymm8
+	cmpl		$2, %r11d
+	jl			0f // ret
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm15
+	vpermilps		$0x55, %ymm15, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm1
+	vfnmadd231ps	%ymm4, %ymm13, %ymm5
+	vfnmadd231ps	%ymm8, %ymm13, %ymm9
+	vpermilps		$0xaa, %ymm15, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm2
+	vfnmadd231ps	%ymm4, %ymm13, %ymm6
+	vfnmadd231ps	%ymm8, %ymm13, %ymm10
+	vpermilps		$0xff, %ymm15, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm3
+	vfnmadd231ps	%ymm4, %ymm13, %ymm7
+	vfnmadd231ps	%ymm8, %ymm13, %ymm11
+
+
+	vextractf128	$0x1, %ymm1, %xmm13
+	vpermilps		$0x55, %xmm13, %xmm13
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+	vucomiss		%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+4:
+	vmovss			%xmm13, 4(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vmulps			%ymm5, %ymm13, %ymm5
+	vmulps			%ymm9, %ymm13, %ymm9
+	cmpl		$3, %r11d
+	jl			0f // ret
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm15
+	vpermilps		$0xaa, %ymm15, %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm2
+	vfnmadd231ps	%ymm5, %ymm13, %ymm6
+	vfnmadd231ps	%ymm9, %ymm13, %ymm10
+	vpermilps		$0xff, %ymm15, %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm3
+	vfnmadd231ps	%ymm5, %ymm13, %ymm7
+	vfnmadd231ps	%ymm9, %ymm13, %ymm11
+
+
+	vextractf128	$0x1, %ymm2, %xmm13
+	vpermilps		$0xaa, %xmm13, %xmm13
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+	vucomiss		%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+6:
+	vmovss			%xmm13, 8(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vmulps			%ymm6, %ymm13, %ymm6
+	vmulps			%ymm10, %ymm13, %ymm10
+	cmpl		$4, %r11d
+	jl			0f // ret
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm15
+	vpermilps		$0xff, %ymm15, %ymm13
+	vfnmadd231ps	%ymm2, %ymm13, %ymm3
+	vfnmadd231ps	%ymm6, %ymm13, %ymm7
+	vfnmadd231ps	%ymm10, %ymm13, %ymm11
+
+
+	vextractf128	$0x1, %ymm3, %xmm13
+	vpermilps		$0xff, %xmm13, %xmm13
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+	vucomiss		%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+8:
+	vmovss			%xmm13, 12(%r10)
+	vpermilps		$0x00, %xmm13, %xmm13
+	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+	vmulps			%ymm7, %ymm13, %ymm7
+	vmulps			%ymm11, %ymm13, %ymm11
+
+	jmp		0f
+
+
+1:
+	vxorps			%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorps			%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorps			%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorps			%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_20x4_vs_lib8, .-inner_edge_potrf_20x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
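+// Rough C sketch of the routine below (illustrative indexing; the 24x4
+// accumulator lives in ymm0-ymm11 as three stacked 8x4 panels, and C is
+// read with the byte panel stride passed in r13):
+//
+//   for (j = 0; j < 4; j++)
+//       for (i = 0; i < 24; i++) {
+//           acc[i][j] *= alpha;
+//           if (beta != 0.0f)
+//               acc[i][j] += beta * C[i][j];   // C in panel-major storage
+//       }
+//
+// The beta==0.0 test skips all loads from C.
+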
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_24X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_24x4_lib8, @function
+inner_scale_ab_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_24x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_24x4_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	vmulps		%ymm4, %ymm15, %ymm4
+	vmulps		%ymm5, %ymm15, %ymm5
+	vmulps		%ymm6, %ymm15, %ymm6
+	vmulps		%ymm7, %ymm15, %ymm7
+
+	vmulps		%ymm8, %ymm15, %ymm8
+	vmulps		%ymm9, %ymm15, %ymm9
+	vmulps		%ymm10, %ymm15, %ymm10
+	vmulps		%ymm11, %ymm15, %ymm11
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovaps		0(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm3
+
+	vmovaps		0(%r12, %r13, 1), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm4
+	vmovaps		32(%r12, %r13, 1), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm5
+	vmovaps		64(%r12, %r13, 1), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm6
+	vmovaps		96(%r12, %r13, 1), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm7
+
+	vmovaps		0(%r12, %r13, 2), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm8
+	vmovaps		32(%r12, %r13, 2), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm9
+	vmovaps		64(%r12, %r13, 2), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm10
+	vmovaps		96(%r12, %r13, 2), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_24x4_lib8, .-inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
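+// The gen variant below additionally takes offsetC (r12), the row offset of
+// C inside its 8-row panel. Only the offset==0 path is implemented: the
+// offset 1..7 branches are TODO stubs that jump straight to the end, so the
+// beta*C term is skipped there. Conceptually:
+//
+//   if (offsetC == 0) { /* aligned path: acc = alpha*acc + beta*C */ }
+//   else              { /* TODO: columns of C straddle two panels  */ }
+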
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_24X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_24x4_gen_lib8, @function
+inner_scale_ab_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_24x4_gen_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	vmulps		%ymm4, %ymm15, %ymm4
+	vmulps		%ymm5, %ymm15, %ymm5
+	vmulps		%ymm6, %ymm15, %ymm6
+	vmulps		%ymm7, %ymm15, %ymm7
+
+	vmulps		%ymm8, %ymm15, %ymm8
+	vmulps		%ymm9, %ymm15, %ymm9
+	vmulps		%ymm10, %ymm15, %ymm10
+	vmulps		%ymm11, %ymm15, %ymm11
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	movq	%r13, %r15 // C1 <- C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+	movq	%r15, %rax // C2 <- C1
+	addq	%r14, %rax // C2 <- C1 + 4*sdc*sizeof(double)
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm0
+	vmovaps		32(%r13), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm1
+	vmovaps		64(%r13), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm2
+	vmovaps		96(%r13), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm3
+
+	vmovaps		0(%r15), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm4
+	vmovaps		32(%r15), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm5
+	vmovaps		64(%r15), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm6
+	vmovaps		96(%r15), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm7
+
+	vmovaps		0(%rax), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm8
+	vmovaps		32(%rax), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm9
+	vmovaps		64(%rax), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm10
+	vmovaps		96(%rax), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm11
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%rax, %rbx // C2
+	addq	%r14, %rbx // C3 <- C2 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_24x4_gen_lib8, .-inner_scale_ab_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10   <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
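+// Rough sketch of the beta==0 specialization below: no memory is touched,
+// only the 24x4 accumulator (ymm0-ymm11) is scaled.
+//
+//   for (j = 0; j < 4; j++)
+//       for (i = 0; i < 24; i++)
+//           acc[i][j] *= alpha;
+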
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_A0_24X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_a0_24x4_lib8, @function
+inner_scale_a0_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_a0_24x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_24x4_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm12
+
+	vmulps		%ymm0, %ymm12, %ymm0
+	vmulps		%ymm1, %ymm12, %ymm1
+	vmulps		%ymm2, %ymm12, %ymm2
+	vmulps		%ymm3, %ymm12, %ymm3
+
+	vmulps		%ymm4, %ymm12, %ymm4
+	vmulps		%ymm5, %ymm12, %ymm5
+	vmulps		%ymm6, %ymm12, %ymm6
+	vmulps		%ymm7, %ymm12, %ymm7
+
+	vmulps		%ymm8, %ymm12, %ymm8
+	vmulps		%ymm9, %ymm12, %ymm9
+	vmulps		%ymm10, %ymm12, %ymm10
+	vmulps		%ymm11, %ymm12, %ymm11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_a0_24x4_lib8, .-inner_scale_a0_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_24X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_24x4_lib8, @function
+inner_scale_11_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_24x4_lib8; .scl 2; .type 32; .endef
+inner_scale_11_24x4_lib8:
+#endif
+#endif
+	
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovaps		.LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovaps		LC03(%rip), %ymm14
+#endif
+
+	vmovaps		0(%r10), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm0
+	vmovaps		32(%r10), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm1
+	vmovaps		64(%r10), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm2
+	vmovaps		96(%r10), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm3
+
+	vmovaps		0(%r10, %r11, 1), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm4
+	vmovaps		32(%r10, %r11, 1), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm5
+	vmovaps		64(%r10, %r11, 1), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm6
+	vmovaps		96(%r10, %r11, 1), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm7
+
+	vmovaps		0(%r10, %r11, 2), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm8
+	vmovaps		32(%r10, %r11, 2), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm9
+	vmovaps		64(%r10, %r11, 2), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm10
+	vmovaps		96(%r10, %r11, 2), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_24x4_lib8, .-inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_24X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_24x4_gen_lib8, @function
+inner_scale_11_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_24x4_gen_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovaps		.LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovaps		LC03(%rip), %ymm14
+#endif
+
+	vmovaps		0(%r11), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm0
+	vmovaps		32(%r11), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm1
+	vmovaps		64(%r11), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm2
+	vmovaps		96(%r11), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm3
+
+	vmovaps		0(%r11, %r12, 1), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm4
+	vmovaps		32(%r11, %r12, 1), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm5
+	vmovaps		64(%r11, %r12, 1), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm6
+	vmovaps		96(%r11, %r12, 1), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm7
+
+	vmovaps		0(%r11, %r12, 2), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm8
+	vmovaps		32(%r11, %r12, 2), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm9
+	vmovaps		64(%r11, %r12, 2), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm10
+	vmovaps		96(%r11, %r12, 2), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm11
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_24x4_gen_lib8, .-inner_scale_11_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// r13   <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
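+// The vblendps cascade at the top of this routine undoes the interleaved
+// lane order produced while accumulating (see the register comments of the
+// gen variant further down: ymm0 = [d00 d11 d22 d33 ...], ymm1 = [d01 d10
+// d23 d32 ...], etc.) and leaves plain columns in ymm0-ymm3, ymm4-ymm7 and
+// ymm8-ymm11. Two in-lane blend rounds per panel -- odd/even lanes first,
+// then pairs -- are enough, so no cross-lane permutes are needed. The
+// scaling that follows is the same alpha/beta update as in
+// inner_scale_ab_24x4_lib8.
+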
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_24X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_24x4_lib8, @function
+inner_blend_scale_ab_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_24x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_24x4_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
+	vblendps	$0x55, %ymm3, %ymm2, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
+	vblendps	$0x33, %ymm14, %ymm13, %ymm3
+
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
+	vblendps	$0x55, %ymm5, %ymm4, %ymm13
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
+	vblendps	$0x55, %ymm7, %ymm6, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
+	vblendps	$0x33, %ymm15, %ymm12, %ymm6
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
+	vblendps	$0x33, %ymm14, %ymm13, %ymm7
+
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm4, %ymm15, %ymm4
+	vmulps		%ymm5, %ymm15, %ymm5
+	vmulps		%ymm6, %ymm15, %ymm6
+	vmulps		%ymm7, %ymm15, %ymm7
+
+	vblendps	$0xaa, %ymm9, %ymm8, %ymm12
+	vblendps	$0x55, %ymm9, %ymm8, %ymm13
+	vblendps	$0xaa, %ymm11, %ymm10, %ymm14
+	vblendps	$0x55, %ymm11, %ymm10, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm8
+	vblendps	$0x33, %ymm15, %ymm12, %ymm10
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm9
+	vblendps	$0x33, %ymm14, %ymm13, %ymm11
+
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm8, %ymm15, %ymm8
+	vmulps		%ymm9, %ymm15, %ymm9
+	vmulps		%ymm10, %ymm15, %ymm10
+	vmulps		%ymm11, %ymm15, %ymm11
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	movq	%r12, %r15 // C1 <- C0
+	addq	%r13, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+	movq	%r15, %rax // C2 <- C1
+	addq	%r13, %rax // C2 <- C1 + 4*sdc*sizeof(double)
+
+	vmovaps		0(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm3
+
+	vmovaps		0(%r15), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm4
+	vmovaps		32(%r15), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm5
+	vmovaps		64(%r15), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm6
+	vmovaps		96(%r15), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm7
+
+	vmovaps		0(%rax), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm8
+	vmovaps		32(%rax), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm9
+	vmovaps		64(%rax), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm10
+	vmovaps		96(%rax), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_24x4_lib8, .-inner_blend_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_24X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_24x4_gen_lib8, @function
+inner_blend_scale_ab_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_24x4_gen_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
+	vblendps	$0x55, %ymm3, %ymm2, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
+	vblendps	$0x33, %ymm14, %ymm13, %ymm3
+
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
+	vblendps	$0x55, %ymm5, %ymm4, %ymm13
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
+	vblendps	$0x55, %ymm7, %ymm6, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
+	vblendps	$0x33, %ymm15, %ymm12, %ymm6
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
+	vblendps	$0x33, %ymm14, %ymm13, %ymm7
+
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm4, %ymm15, %ymm4
+	vmulps		%ymm5, %ymm15, %ymm5
+	vmulps		%ymm6, %ymm15, %ymm6
+	vmulps		%ymm7, %ymm15, %ymm7
+
+	vblendps	$0xaa, %ymm9, %ymm8, %ymm12
+	vblendps	$0x55, %ymm9, %ymm8, %ymm13
+	vblendps	$0xaa, %ymm11, %ymm10, %ymm14
+	vblendps	$0x55, %ymm11, %ymm10, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm8
+	vblendps	$0x33, %ymm15, %ymm12, %ymm10
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm9
+	vblendps	$0x33, %ymm14, %ymm13, %ymm11
+
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm8, %ymm15, %ymm8
+	vmulps		%ymm9, %ymm15, %ymm9
+	vmulps		%ymm10, %ymm15, %ymm10
+	vmulps		%ymm11, %ymm15, %ymm11
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	movq	%r13, %r15 // C1 <- C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+	movq	%r15, %rax // C2 <- C1
+	addq	%r14, %rax // C2 <- C1 + 4*sdc*sizeof(double)
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm0
+	vmovaps		32(%r13), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm1
+	vmovaps		64(%r13), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm2
+	vmovaps		96(%r13), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm3
+
+	vmovaps		0(%r15), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm4
+	vmovaps		32(%r15), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm5
+	vmovaps		64(%r15), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm6
+	vmovaps		96(%r15), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm7
+
+	vmovaps		0(%rax), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm8
+	vmovaps		32(%rax), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm9
+	vmovaps		64(%rax), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm10
+	vmovaps		96(%rax), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm11
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%rax, %rbx // C2
+	addq	%r14, %rbx // C3 <- C2 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_24x4_gen_lib8, .-inner_blend_scale_ab_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// r11   <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
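+// Same un-blending as in inner_blend_scale_ab_24x4_lib8, specialized for
+// alpha==1.0 and beta==1.0: after the shuffle the three C panels are simply
+// added to the accumulator, roughly
+//
+//   for (j = 0; j < 4; j++)
+//       for (i = 0; i < 24; i++)
+//           acc[i][j] += C[i][j];
+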
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_24X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_24x4_lib8, @function
+inner_blend_scale_11_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_24x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_24x4_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
+	vblendps	$0x55, %ymm3, %ymm2, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
+	vblendps	$0x33, %ymm14, %ymm13, %ymm3
+
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
+	vblendps	$0x55, %ymm5, %ymm4, %ymm13
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
+	vblendps	$0x55, %ymm7, %ymm6, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
+	vblendps	$0x33, %ymm15, %ymm12, %ymm6
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
+	vblendps	$0x33, %ymm14, %ymm13, %ymm7
+
+	vblendps	$0xaa, %ymm9, %ymm8, %ymm12
+	vblendps	$0x55, %ymm9, %ymm8, %ymm13
+	vblendps	$0xaa, %ymm11, %ymm10, %ymm14
+	vblendps	$0x55, %ymm11, %ymm10, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm8
+	vblendps	$0x33, %ymm15, %ymm12, %ymm10
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm9
+	vblendps	$0x33, %ymm14, %ymm13, %ymm11
+
+	movq	%r10, %r15 // C1 <- C0
+	addq	%r11, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+	movq	%r15, %rax // C2 <- C1
+	addq	%r11, %rax // C2 <- C1 + 4*sdc*sizeof(double)
+
+	vmovaps		0(%r10), %ymm15
+	vaddps		%ymm15, %ymm0, %ymm0
+	vmovaps		32(%r10), %ymm15
+	vaddps		%ymm15, %ymm1, %ymm1
+	vmovaps		64(%r10), %ymm15
+	vaddps		%ymm15, %ymm2, %ymm2
+	vmovaps		96(%r10), %ymm15
+	vaddps		%ymm15, %ymm3, %ymm3
+
+	vmovaps		0(%r15), %ymm15
+	vaddps		%ymm15, %ymm4, %ymm4
+	vmovaps		32(%r15), %ymm15
+	vaddps		%ymm15, %ymm5, %ymm5
+	vmovaps		64(%r15), %ymm15
+	vaddps		%ymm15, %ymm6, %ymm6
+	vmovaps		96(%r15), %ymm15
+	vaddps		%ymm15, %ymm7, %ymm7
+
+	vmovaps		0(%rax), %ymm15
+	vaddps		%ymm15, %ymm8, %ymm8
+	vmovaps		32(%rax), %ymm15
+	vaddps		%ymm15, %ymm9, %ymm9
+	vmovaps		64(%rax), %ymm15
+	vaddps		%ymm15, %ymm10, %ymm10
+	vmovaps		96(%rax), %ymm15
+	vaddps		%ymm15, %ymm11, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_24x4_lib8, .-inner_blend_scale_11_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_24X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_24x4_gen_lib8, @function
+inner_blend_scale_11_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_24x4_gen_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
+	vblendps	$0x55, %ymm3, %ymm2, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
+	vblendps	$0x33, %ymm14, %ymm13, %ymm3
+
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
+	vblendps	$0x55, %ymm5, %ymm4, %ymm13
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
+	vblendps	$0x55, %ymm7, %ymm6, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
+	vblendps	$0x33, %ymm15, %ymm12, %ymm6
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
+	vblendps	$0x33, %ymm14, %ymm13, %ymm7
+
+	vblendps	$0xaa, %ymm9, %ymm8, %ymm12
+	vblendps	$0x55, %ymm9, %ymm8, %ymm13
+	vblendps	$0xaa, %ymm11, %ymm10, %ymm14
+	vblendps	$0x55, %ymm11, %ymm10, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm8
+	vblendps	$0x33, %ymm15, %ymm12, %ymm10
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm9
+	vblendps	$0x33, %ymm14, %ymm13, %ymm11
+
+	movq	%r11, %r15 // C1 <- C0
+	addq	%r12, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+	movq	%r15, %rax // C2 <- C1
+	addq	%r12, %rax // C2 <- C1 + 4*sdc*sizeof(double)
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r11), %ymm15
+	vaddps		%ymm15, %ymm0, %ymm0
+	vmovaps		32(%r11), %ymm15
+	vaddps		%ymm15, %ymm1, %ymm1
+	vmovaps		64(%r11), %ymm15
+	vaddps		%ymm15, %ymm2, %ymm2
+	vmovaps		96(%r11), %ymm15
+	vaddps		%ymm15, %ymm3, %ymm3
+
+	vmovaps		0(%r15), %ymm15
+	vaddps		%ymm15, %ymm4, %ymm4
+	vmovaps		32(%r15), %ymm15
+	vaddps		%ymm15, %ymm5, %ymm5
+	vmovaps		64(%r15), %ymm15
+	vaddps		%ymm15, %ymm6, %ymm6
+	vmovaps		96(%r15), %ymm15
+	vaddps		%ymm15, %ymm7, %ymm7
+
+	vmovaps		0(%rax), %ymm15
+	vaddps		%ymm15, %ymm8, %ymm8
+	vmovaps		32(%rax), %ymm15
+	vaddps		%ymm15, %ymm9, %ymm9
+	vmovaps		64(%rax), %ymm15
+	vaddps		%ymm15, %ymm10, %ymm10
+	vmovaps		96(%rax), %ymm15
+	vaddps		%ymm15, %ymm11, %ymm11
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%rax, %rbx // C2
+	addq	%r12, %rbx // C3 <- C2 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_24x4_gen_lib8, .-inner_blend_scale_11_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
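+// Rough sketch of the store below (lib8 panel-major layout, illustrative
+// indexing): the 24x4 tile is written as three 8x4 panels, each panel
+// starting 8*sdd floats after the previous one (r11 holds that stride in
+// bytes).
+//
+//   for (p = 0; p < 3; p++)              // panel
+//       for (j = 0; j < 4; j++)          // column
+//           for (i = 0; i < 8; i++)      // row inside the panel
+//               D[p*8*sdd + j*8 + i] = acc[8*p + i][j];
+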
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_24X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_24x4_lib8, @function
+inner_store_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_24x4_lib8; .scl 2; .type 32; .endef
+inner_store_24x4_lib8:
+#endif
+#endif
+	
+	movq	%r10, %r15 // D1 <- D0
+	addq	%r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+	movq	%r15, %rax // D2 <- D1
+	addq	%r11, %rax // D2 <- D1 + 4*sdd*sizeof(double)
+
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps		%ymm3, 96(%r10)
+
+	vmovaps 	%ymm4,  0(%r15)
+	vmovaps 	%ymm5, 32(%r15)
+	vmovaps 	%ymm6, 64(%r15)
+	vmovaps 	%ymm7, 96(%r15)
+
+	vmovaps 	%ymm8,  0(%rax)
+	vmovaps 	%ymm9, 32(%rax)
+	vmovaps 	%ymm10, 64(%rax)
+	vmovaps 	%ymm11, 96(%rax)
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_24x4_lib8, .-inner_store_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
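+// The vs store below writes the first two panels whole and row-masks only
+// the third one (rows 16..23): km is broadcast and subtracted from the
+// per-lane row-index constants loaded from .LC02, so the lanes whose row
+// index is below km come out negative, and vmaskmovps stores exactly where
+// the sign bit is set. Columns beyond kn are skipped by the cmpl/jl ladder
+// on r13d.
+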
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_24X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_24x4_vs_lib8, @function
+inner_store_24x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_24x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_24x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC02(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	vmovaps		%ymm0,  0(%r10)
+	vmovaps		%ymm4, 0(%r10, %r11, 1)
+	vmaskmovps	%ymm8, %ymm15,  0(%r10, %r11, 2)
+	cmpl		$2, %r13d
+	jl			0f // end
+	vmovaps		%ymm1, 32(%r10)
+	vmovaps		%ymm5, 32(%r10, %r11, 1)
+	vmaskmovps	%ymm9, %ymm15, 32(%r10, %r11, 2)
+	cmpl		$3, %r13d
+	jl			0f // end
+	vmovaps		%ymm2, 64(%r10)
+	vmovaps		%ymm6, 64(%r10, %r11, 1)
+	vmaskmovps	%ymm10, %ymm15, 64(%r10, %r11, 2)
+	je			0f // end
+	vmovaps		%ymm3, 96(%r10)
+	vmovaps		%ymm7, 96(%r10, %r11, 1)
+	vmaskmovps	%ymm11, %ymm15, 96(%r10, %r11, 2)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_24x4_vs_lib8, .-inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
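+// Sketch of the gen store below: the first n0 columns are dropped by
+// shifting the accumulator registers left (advancing D by 32 bytes per
+// dropped column), the column count is then clipped to min(n1,4)-n0, and
+// two row masks are built: the one derived from m0 (.LC00) masks the first
+// panel, the one derived from m1 (.LC02) masks the third, while the middle
+// panel is always stored whole. As in the other *_gen routines, only
+// offsetD==0 is implemented; the offset 1..7 branches are TODO stubs.
+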
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_24X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_24x4_gen_lib8, @function
+inner_store_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_24x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+	vmovups		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+	vmovups		LC02(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	// shift D and the accumulation registers past the first n0 cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm9, %ymm8
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm10, %ymm9
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm7, %ymm6
+	vmovaps		%ymm11, %ymm10
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm9, %ymm8
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm10, %ymm9
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm9, %ymm8
+	addq		$32, %r11
+
+0:
+
+	// compute D1
+	movq	%r11, %rbx // D1
+	addq	%r12, %rbx // D1 <- D0 + 4*sdd*sizeof(float)
+	movq	%rbx, %rbp // D2
+	addq	%r12, %rbp // D2 <- D1 + 4*sdd*sizeof(float)
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	cmpl		$2, %r15d
+	vmaskmovps	%ymm0, %ymm14,  0(%r11)
+	vmovaps		%ymm4, 0(%rbx)
+	vmaskmovps	%ymm8, %ymm15,  0(%rbp)
+	jl			7f // end
+	cmpl		$3, %r15d
+	vmaskmovps	%ymm1, %ymm14, 32(%r11)
+	vmovaps		%ymm5, 32(%rbx)
+	vmaskmovps	%ymm9, %ymm15, 32(%rbp)
+	jl			7f // end
+	vmaskmovps	%ymm2, %ymm14, 64(%r11)
+	vmovaps		%ymm6, 64(%rbx)
+	vmaskmovps	%ymm10, %ymm15, 64(%rbp)
+	je			7f // end
+	vmaskmovps	%ymm3, %ymm14, 96(%r11)
+	vmovaps		%ymm7, 96(%rbx)
+	vmaskmovps	%ymm11, %ymm15, 96(%rbp)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+//	movq	%r11, %rbp // D1
+//	addq	%r12, %rbp // D2 <- D1 + 4*sdd*sizeof(float)
+	addq	%rbp, %r12 // D3 <- D2 + 4*sdd*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_24x4_gen_lib8, .-inner_store_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_20X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_20x4_lib8, @function
+inner_store_l_20x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_20x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_20x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_20x4_lib8:
+#endif
+#endif
+	
+	vmovaps		0(%r10), %ymm12
+	vmovaps		32(%r10), %ymm13
+	vmovaps		64(%r10), %ymm14
+	vmovaps		96(%r10), %ymm15
+
+	vblendps	$0x0f, %ymm12, %ymm0, %ymm0
+	vblendps	$0x1f, %ymm13, %ymm1, %ymm1
+	vblendps	$0x3f, %ymm14, %ymm2, %ymm2
+	vblendps	$0x7f, %ymm15, %ymm3, %ymm3
+
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps		%ymm3, 96(%r10)
+
+	vmovaps 	%ymm4,  0(%r10, %r11, 1)
+	vmovaps 	%ymm5, 32(%r10, %r11, 1)
+	vmovaps 	%ymm6, 64(%r10, %r11, 1)
+	vmovaps 	%ymm7, 96(%r10, %r11, 1)
+
+	vmovaps 	%ymm8,  0(%r10, %r11, 2)
+	vmovaps 	%ymm9, 32(%r10, %r11, 2)
+	vmovaps 	%ymm10, 64(%r10, %r11, 2)
+	vmovaps 	%ymm11, 96(%r10, %r11, 2)
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_20x4_lib8, .-inner_store_l_20x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
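+// The store_l routines keep D lower triangular: before writing column j of
+// the first panel they reload it from D and blend back its first j entries,
+// so the strictly upper part of the 4x4 top block is left untouched in
+// memory (the 20x4 variant above preserves the first 4+j entries instead).
+// Roughly:
+//
+//   for (j = 1; j < 4; j++)
+//       for (i = 0; i < j; i++)
+//           acc[i][j] = D[i][j];      // keep what is already in D
+//   // then store the three panels as in inner_store_24x4_lib8
+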
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_24X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_24x4_lib8, @function
+inner_store_l_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_24x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_24x4_lib8:
+#endif
+#endif
+	
+	vmovaps		32(%r10), %ymm12
+	vmovaps		64(%r10), %ymm13
+	vmovaps		96(%r10), %ymm14
+
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vblendps	$0x03, %ymm13, %ymm2, %ymm2
+	vblendps	$0x07, %ymm14, %ymm3, %ymm3
+
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps		%ymm3, 96(%r10)
+
+	vmovaps 	%ymm4,  0(%r10, %r11, 1)
+	vmovaps 	%ymm5, 32(%r10, %r11, 1)
+	vmovaps 	%ymm6, 64(%r10, %r11, 1)
+	vmovaps 	%ymm7, 96(%r10, %r11, 1)
+
+	vmovaps 	%ymm8,  0(%r10, %r11, 2)
+	vmovaps 	%ymm9, 32(%r10, %r11, 2)
+	vmovaps 	%ymm10, 64(%r10, %r11, 2)
+	vmovaps 	%ymm11, 96(%r10, %r11, 2)
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_24x4_lib8, .-inner_store_l_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_20X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_20x4_vs_lib8, @function
+inner_store_l_20x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_20x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_20x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_20x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC02(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	vmovaps		0(%r10), %ymm12
+	vblendps	$0x0f, %ymm12, %ymm0, %ymm0
+	vmovaps		%ymm0, 0(%r10)
+	vmovaps		%ymm4, 0(%r10, %r11, 1)
+	vmaskmovps	%ymm8, %ymm15, 0(%r10, %r11, 2)
+	cmpl		$2, %r13d
+	jl			0f // end
+	vmovaps		32(%r10), %ymm12
+	vblendps	$0x1f, %ymm12, %ymm1, %ymm1
+	vmovaps		%ymm1, 32(%r10)
+	vmovaps		%ymm5, 32(%r10, %r11, 1)
+	vmaskmovps	%ymm9, %ymm15, 32(%r10, %r11, 2)
+	cmpl		$3, %r13d
+	jl			0f // end
+	vmovaps		64(%r10), %ymm12
+	vblendps	$0x3f, %ymm12, %ymm2, %ymm2
+	vmovaps		%ymm2, 64(%r10)
+	vmovaps		%ymm6, 64(%r10, %r11, 1)
+	vmaskmovps	%ymm10, %ymm15, 64(%r10, %r11, 2)
+	je			0f // end
+	vmovaps		96(%r10), %ymm12
+	vblendps	$0x7f, %ymm12, %ymm3, %ymm3
+	vmovaps		%ymm3, 96(%r10)
+	vmovaps		%ymm7, 96(%r10, %r11, 1)
+	vmaskmovps	%ymm11, %ymm15, 96(%r10, %r11, 2)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_20x4_vs_lib8, .-inner_store_l_20x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- 4*sdd*sizeof(double)
+// r12  <- km
+// r13  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_24X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_24x4_vs_lib8, @function
+inner_store_l_24x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_24x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_24x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC02(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	vmovaps		%ymm0, 0(%r10)
+	vmovaps		%ymm4, 0(%r10, %r11, 1)
+	vmaskmovps	%ymm8, %ymm15, 0(%r10, %r11, 2)
+	cmpl		$2, %r13d
+	jl			0f // end
+	vmovaps		32(%r10), %ymm12
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vmovaps		%ymm1, 32(%r10)
+	vmovaps		%ymm5, 32(%r10, %r11, 1)
+	vmaskmovps	%ymm9, %ymm15, 32(%r10, %r11, 2)
+	cmpl		$3, %r13d
+	jl			0f // end
+	vmovaps		64(%r10), %ymm12
+	vblendps	$0x03, %ymm12, %ymm2, %ymm2
+	vmovaps		%ymm2, 64(%r10)
+	vmovaps		%ymm6, 64(%r10, %r11, 1)
+	vmaskmovps	%ymm10, %ymm15, 64(%r10, %r11, 2)
+	je			0f // end
+	vmovaps		96(%r10), %ymm12
+	vblendps	$0x07, %ymm12, %ymm3, %ymm3
+	vmovaps		%ymm3, 96(%r10)
+	vmovaps		%ymm7, 96(%r10, %r11, 1)
+	vmaskmovps	%ymm11, %ymm15, 96(%r10, %r11, 2)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_24x4_vs_lib8, .-inner_store_l_24x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_20X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_20x4_gen_lib8, @function
+inner_store_l_20x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_20x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_20x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_20x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+	vmovups		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+	vmovups		LC02(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	// shift D and the accumulation registers past the first n0 cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm9, %ymm8
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm10, %ymm9
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm7, %ymm6
+	vmovaps		%ymm11, %ymm10
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm9, %ymm8
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm10, %ymm9
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm9, %ymm8
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmovaps		0(%r11), %ymm12
+	vblendps	$0x0f, %ymm12, %ymm0, %ymm0
+	vmaskmovps	%ymm0, %ymm14,  0(%r11)
+	vmovaps		%ymm4, 0(%r11, %r12, 1)
+	vmaskmovps	%ymm8, %ymm15,  0(%r11, %r12, 2)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmovaps		32(%r11), %ymm12
+	vblendps	$0x1f, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm14, 32(%r11)
+	vmovaps		%ymm5, 32(%r11, %r12, 1)
+	vmaskmovps	%ymm9, %ymm15, 32(%r11, %r12, 2)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmovaps		64(%r11), %ymm12
+	vblendps	$0x3f, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm14, 64(%r11)
+	vmovaps		%ymm6, 64(%r11, %r12, 1)
+	vmaskmovps	%ymm10, %ymm15, 64(%r11, %r12, 2)
+	je			7f // end
+	vmovaps		96(%r11), %ymm12
+	vblendps	$0x7f, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm14, 96(%r11)
+	vmovaps		%ymm7, 96(%r11, %r12, 1)
+	vmaskmovps	%ymm11, %ymm15, 96(%r11, %r12, 2)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_20x4_gen_lib8, .-inner_store_l_20x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_24X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_24x4_gen_lib8, @function
+inner_store_l_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_24x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+	vmovups		.LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+	vmovups		LC02(%rip), %ymm13
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm13, %ymm15
+
+	// shift D and the accumulation registers past the first n0 cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm9, %ymm8
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm10, %ymm9
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm7, %ymm6
+	vmovaps		%ymm11, %ymm10
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm9, %ymm8
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm10, %ymm9
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm9, %ymm8
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm14,  0(%r11)
+	vmovaps		%ymm4, 0(%r11, %r12, 1)
+	vmaskmovps	%ymm8, %ymm15,  0(%r11, %r12, 2)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmovaps		32(%r11), %ymm12
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm14, 32(%r11)
+	vmovaps		%ymm5, 32(%r11, %r12, 1)
+	vmaskmovps	%ymm9, %ymm15, 32(%r11, %r12, 2)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmovaps		64(%r11), %ymm12
+	vblendps	$0x03, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm14, 64(%r11)
+	vmovaps		%ymm6, 64(%r11, %r12, 1)
+	vmaskmovps	%ymm10, %ymm15, 64(%r11, %r12, 2)
+	je			7f // end
+	vmovaps		96(%r11), %ymm12
+	vblendps	$0x07, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm14, 96(%r11)
+	vmovaps		%ymm7, 96(%r11, %r12, 1)
+	vmaskmovps	%ymm11, %ymm15, 96(%r11, %r12, 2)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_24x4_gen_lib8, .-inner_store_l_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                                1      2             3         4        5         6            7         8        9         10
+// void kernel_sgemm_nt_24x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
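+// Reference semantics of this kernel, as a rough C sketch (panel-major
+// storage of A, B, C, D omitted; names are illustrative, not the BLASFEO
+// reference implementation):
+//
+//   // D[24][4] = alpha * A[24][k] * B[4][k]^T + beta * C[24][4]
+//   for (i = 0; i < 24; i++)
+//       for (j = 0; j < 4; j++) {
+//           float t = 0.0f;
+//           for (l = 0; l < k; l++)
+//               t += A[i][l] * B[j][l];        // "nt": B is accessed transposed
+//           D[i][j] = alpha*t + beta*C[i][j];
+//       }
+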
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_24x4_lib8
+	.type kernel_sgemm_nt_24x4_lib8, @function
+kernel_sgemm_nt_24x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_24x4_lib8
+_kernel_sgemm_nt_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_24x4_lib8
+	.def kernel_sgemm_nt_24x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_24x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	%rsi, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_24x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_24x4_lib8, .-kernel_sgemm_nt_24x4_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5         6            7         8        9         10       11      12
+// void kernel_sgemm_nt_24x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_24x4_vs_lib8
+	.type kernel_sgemm_nt_24x4_vs_lib8, @function
+kernel_sgemm_nt_24x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_24x4_vs_lib8
+_kernel_sgemm_nt_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_24x4_vs_lib8
+	.def kernel_sgemm_nt_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_24x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	%rsi, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_24x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_24x4_vs_lib8, .-kernel_sgemm_nt_24x4_vs_lib8
+#endif
+
+
+
+
+
+//                                    rdi    rsi           rdx       rcx      r8        r9           rsp+8        rsp+16    rsp+24   rsp+32       rsp+40    rsp+48   rsp+56  rsp+64  rsp+72  rsp+80
+// void kernel_sgemm_nt_24x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_24x4_gen_lib8
+	.type kernel_sgemm_nt_24x4_gen_lib8, @function
+kernel_sgemm_nt_24x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_24x4_gen_lib8
+_kernel_sgemm_nt_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_24x4_gen_lib8
+	.def kernel_sgemm_nt_24x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_24x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12 // offsetC
+	movq	ARG8, %r13 // C
+	movq	ARG9, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_24X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_24x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_24x4_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG10, %r10 // offsetD
+	movq	ARG11, %r11 // D
+	movq	ARG12, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG13, %r13 // m0
+	movq	ARG14, %r14 // m1
+	movq	ARG15, %r15 // n0
+	movq	ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_24X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_24x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_24x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_24x4_gen_lib8, .-kernel_sgemm_nt_24x4_gen_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5            6         7        8            9         10       11        12
+// void kernel_sgemm_nn_24x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd);
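+//
+// Illustrative caller sketch, assuming alpha/beta are passed by pointer and that
+// offsetB is the row offset (0..7) of the first used row inside B's leading
+// 8-row panel; it computes D = alpha*A*B + beta*C on a 24x4 block:
+//
+//   kernel_sgemm_nn_24x4_lib8(k, &alpha, A, sda, 0, B, sdb, &beta, C, sdc, D, sdd);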
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_24x4_lib8
+	.type kernel_sgemm_nn_24x4_lib8, @function
+kernel_sgemm_nn_24x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_24x4_lib8
+_kernel_sgemm_nn_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_24x4_lib8
+	.def kernel_sgemm_nn_24x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_24x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12   // C
+	movq	ARG10, %r13   // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_24x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_24x4_lib8, .-kernel_sgemm_nn_24x4_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5            6         7        8            9         10       11        12       13      14
+// void kernel_sgemm_nn_24x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_24x4_vs_lib8
+	.type kernel_sgemm_nn_24x4_vs_lib8, @function
+kernel_sgemm_nn_24x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_24x4_vs_lib8
+_kernel_sgemm_nn_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_24x4_vs_lib8
+	.def kernel_sgemm_nn_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_24x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12   // C
+	movq	ARG10, %r13   // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG13, %r12 // km
+	movq	ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_24x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_24x4_vs_lib8, .-kernel_sgemm_nn_24x4_vs_lib8
+#endif
+
+
+
+
+
+//                                    rdi    rsi           rdx       rcx      r8        r9        rsp+8    rsp+16       rsp+24    rsp+32    rsp+40   rsp+48    rsp+56    rsp+64   rsp+72  rsp+80  rsp+88  rsp+96
+// void kernel_sgemm_nn_24x4_gen_lib8(int k, float *alpha, float *A, int sda, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_24x4_gen_lib8
+	.type kernel_sgemm_nn_24x4_gen_lib8, @function
+kernel_sgemm_nn_24x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_24x4_gen_lib8
+_kernel_sgemm_nn_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_24x4_gen_lib8
+	.def kernel_sgemm_nn_24x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_24x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13  // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG8, %r11 // beta
+	movq	ARG9, %r12 // offsetC
+	movq	ARG10, %r13 // C
+	movq	ARG11, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_24X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_24x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_24x4_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG12, %r10 // offsetD
+	movq	ARG13, %r11 // D
+	movq	ARG14, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG15, %r13 // m0
+	movq	ARG16, %r14 // m1
+	movq	ARG17, %r15 // n0
+	movq	ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_24X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_24x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_24x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_24x4_gen_lib8, .-kernel_sgemm_nn_24x4_gen_lib8
+#endif
+
+
+
+
+
+//                                       rdi    rsi       rdx      rcx       r8        r9       rsp+8     rsp+16   rsp+24    rsp+32 
+// void kernel_strsm_nt_rl_inv_24x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
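+//
+// Operation sketch, read from the gemm-sub / blend / trsm-edge stages below and
+// hedged accordingly: D <= (C - A*B^T) * E^{-T}, with E lower triangular and
+// inv_diag_E assumed to hold the reciprocals of E's diagonal entries:
+//
+//   kernel_strsm_nt_rl_inv_24x4_lib8(k, A, sda, B, C, sdc, D, sdd, E, inv_diag_E);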
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_24x4_lib8
+	.type kernel_strsm_nt_rl_inv_24x4_lib8, @function
+kernel_strsm_nt_rl_inv_24x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_24x4_lib8
+_kernel_strsm_nt_rl_inv_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_24x4_lib8
+	.def kernel_strsm_nt_rl_inv_24x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_24x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movl	$4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_24x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_24x4_lib8, .-kernel_strsm_nt_rl_inv_24x4_lib8
+#endif
+
+
+
+
+
+//                                          rdi    rsi       rdx      rcx       r8        r9       rsp+8     rsp+16   rsp+24    rsp+32             rsp+40  rsp+48
+// void kernel_strsm_nt_rl_inv_24x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_24x4_vs_lib8
+	.type kernel_strsm_nt_rl_inv_24x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_24x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_24x4_vs_lib8
+_kernel_strsm_nt_rl_inv_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_24x4_vs_lib8
+	.def kernel_strsm_nt_rl_inv_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_24x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG11, %r12 // m1 
+	movq	ARG12, %r13 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_24x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_24x4_vs_lib8, .-kernel_strsm_nt_rl_inv_24x4_vs_lib8
+#endif
+
+
+
+
+
+//                                             1       2          3         4          5       6          7         8          9         10       11        12       13        14
+// void kernel_sgemm_strsm_nt_rl_inv_24x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
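+//
+// Operation sketch, inferred from the add / sub / solve stages below and hedged
+// accordingly: D <= (C + Ap*Bp^T - Am*Bm^T) * E^{-T}, i.e. the gemm update fused
+// with the same right-lower-transposed triangular solve as the kernel above,
+// with inv_diag_E assumed to hold the reciprocals of E's diagonal entries.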
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_24x4_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_24x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_24x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_24x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_24x4_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_24x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_24x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10  // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG13, %r10  // E 
+	movq	ARG14, %r11  // inv_diag_E 
+	movl	$4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_24x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_24x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_24x4_lib8
+#endif
+
+
+
+
+
+//                                                1       2          3         4          5       6          7         8          9         10       11        12       13        14                 15      16
+// void kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10  // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG13, %r10  // E 
+	movq	ARG14, %r11  // inv_diag_E 
+	movq	ARG16, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG15, %r12 // km 
+	movq	ARG16, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_24x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8
+#endif
+
+
+
+
+
+//                                   1      2         3        4         5         6        7         8        9
+// void kernel_spotrf_nt_l_20x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
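+//
+// Operation sketch, read from the sub / blend / potrf stages below and hedged
+// accordingly: D <= the four lower-Cholesky-factor columns of C - A*B^T on the
+// 20x4 block, with inv_diag_D assumed to receive the reciprocals of the computed
+// diagonal entries:
+//
+//   kernel_spotrf_nt_l_20x4_lib8(k, A, sda, B, C, sdc, D, sdd, inv_diag_D);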
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_20x4_lib8
+	.type kernel_spotrf_nt_l_20x4_lib8, @function
+kernel_spotrf_nt_l_20x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_20x4_lib8
+_kernel_spotrf_nt_l_20x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_20x4_lib8
+	.def kernel_spotrf_nt_l_20x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_20x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_20x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_20x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_20X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_20x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_20x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_20x4_lib8, .-kernel_spotrf_nt_l_20x4_lib8
+#endif
+
+
+
+
+
+//                                      1      2         3        4         5         6        7         8        9                  10      11
+// void kernel_spotrf_nt_l_20x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_20x4_vs_lib8
+	.type kernel_spotrf_nt_l_20x4_vs_lib8, @function
+kernel_spotrf_nt_l_20x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_20x4_vs_lib8
+_kernel_spotrf_nt_l_20x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_20x4_vs_lib8
+	.def kernel_spotrf_nt_l_20x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_20x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_20x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_20x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG10, %r12 // m1 
+	movq	ARG11, %r13 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_20x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_20x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_20x4_vs_lib8, .-kernel_spotrf_nt_l_20x4_vs_lib8
+#endif
+
+
+
+
+
+//                                   1      2         3        4         5         6        7         8        9
+// void kernel_spotrf_nt_l_24x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_24x4_lib8
+	.type kernel_spotrf_nt_l_24x4_lib8, @function
+kernel_spotrf_nt_l_24x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_24x4_lib8
+_kernel_spotrf_nt_l_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_24x4_lib8
+	.def kernel_spotrf_nt_l_24x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_24x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_24x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_24x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_24x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_24x4_lib8, .-kernel_spotrf_nt_l_24x4_lib8
+#endif
+
+
+
+
+
+//                                      1      2         3        4         5         6        7         8        9                  10      11
+// void kernel_spotrf_nt_l_24x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_24x4_vs_lib8
+	.type kernel_spotrf_nt_l_24x4_vs_lib8, @function
+kernel_spotrf_nt_l_24x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_24x4_vs_lib8
+_kernel_spotrf_nt_l_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_24x4_vs_lib8
+	.def kernel_spotrf_nt_l_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_24x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG5, %r10 // C
+	movq	ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_24x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_24x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG10, %r12 // m1 
+	movq	ARG11, %r13 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_24x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_24x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_24x4_vs_lib8, .-kernel_spotrf_nt_l_24x4_vs_lib8
+#endif
+
+
+
+
+
+//                                        1        2          3         4          5       6          7         8          9         10       11        12       13
+// void kernel_ssyrk_spotrf_nt_l_20x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
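+//
+// Operation sketch, inferred from the add / sub / factorize stages below and
+// hedged accordingly: the kernel forms C + Ap*Bp^T - Am*Bm^T and then factorizes
+// it like kernel_spotrf_nt_l_20x4_lib8 above, writing the reciprocals of the
+// computed diagonal to inv_diag_D.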
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_20x4_lib8
+	.type kernel_ssyrk_spotrf_nt_l_20x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_20x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_20x4_lib8
+_kernel_ssyrk_spotrf_nt_l_20x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_20x4_lib8
+	.def kernel_ssyrk_spotrf_nt_l_20x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_20x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_20x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_20x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_20X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_20x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_20x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_20x4_lib8, .-kernel_ssyrk_spotrf_nt_l_20x4_lib8
+#endif
+
+
+
+
+
+//                                            1        2          3         4          5       6          7         8          9         10       11        12       13                14      15
+// void kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8
+	.type kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8
+	.def kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movq	ARG15, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_20x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_20x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+	movq	ARG14, %r12 // km 
+	movq	ARG15, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_20x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_20x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8
+#endif
+
+
+
+
+
+//                                        1        2          3         4          5       6          7         8          9         10       11        12       13
+// void kernel_ssyrk_spotrf_nt_l_24x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_24x4_lib8
+	.type kernel_ssyrk_spotrf_nt_l_24x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_24x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_24x4_lib8
+_kernel_ssyrk_spotrf_nt_l_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_24x4_lib8
+	.def kernel_ssyrk_spotrf_nt_l_24x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_24x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_24x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_24x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_24x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_24x4_lib8, .-kernel_ssyrk_spotrf_nt_l_24x4_lib8
+#endif
+
+
+
+
+
+//                                            1        2          3         4          5       6          7         8          9         10       11        12       13                14      15
+// void kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8
+	.type kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8
+	.def kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+	vmovapd	%ymm0, %ymm4
+	vmovapd	%ymm0, %ymm5
+	vmovapd	%ymm0, %ymm6
+	vmovapd	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+	movq	ARG4, %r13  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG5, %r10                 // km
+	movq	ARG6, %r11                   // Am
+	movq	ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+	movq	ARG8, %r13  // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner blender nn
+
+	movq	ARG9, %r10 // C
+	movq	ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG13, %r10  // inv_diag_D 
+	movq	ARG15, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_24x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_24x4_vs_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG11, %r10 // store address D
+	movq	ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+	movq	ARG14, %r12 // km 
+	movq	ARG15, %r13 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_24x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_24x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+	
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4        5         6            7         8        9         10
+// void kernel_ssyrk_nt_l_24x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
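+//
+// Illustrative caller sketch, assuming alpha/beta are passed by pointer; the
+// kernel computes D = alpha*A*B^T + beta*C on a 24x4 block and the "_l" store
+// below writes only the on/below-diagonal part of the block:
+//
+//   kernel_ssyrk_nt_l_24x4_lib8(k, &alpha, A, sda, B, &beta, C, sdc, D, sdd);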
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_24x4_lib8
+	.type kernel_ssyrk_nt_l_24x4_lib8, @function
+kernel_ssyrk_nt_l_24x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_24x4_lib8
+_kernel_ssyrk_nt_l_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_24x4_lib8
+	.def kernel_ssyrk_nt_l_24x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_24x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_l_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_24x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_24x4_lib8, .-kernel_ssyrk_nt_l_24x4_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5         6            7         8        9         10       11      12
+// void kernel_ssyrk_nt_l_24x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_24x4_vs_lib8
+	.type kernel_ssyrk_nt_l_24x4_vs_lib8, @function
+kernel_ssyrk_nt_l_24x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_24x4_vs_lib8
+_kernel_ssyrk_nt_l_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_24x4_vs_lib8
+	.def kernel_ssyrk_nt_l_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_24x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_l_24x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_24x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_24x4_vs_lib8, .-kernel_ssyrk_nt_l_24x4_vs_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4        5         6            7         8        9         10
+// void kernel_ssyrk_nt_l_20x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_20x4_lib8
+	.type kernel_ssyrk_nt_l_20x4_lib8, @function
+kernel_ssyrk_nt_l_20x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_20x4_lib8
+_kernel_ssyrk_nt_l_20x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_20x4_lib8
+	.def kernel_ssyrk_nt_l_20x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_20x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_20X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_l_20x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_20x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_20x4_lib8, .-kernel_ssyrk_nt_l_20x4_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5         6            7         8        9         10       11      12
+// void kernel_ssyrk_nt_l_20x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_20x4_vs_lib8
+	.type kernel_ssyrk_nt_l_20x4_vs_lib8, @function
+kernel_ssyrk_nt_l_20x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_20x4_vs_lib8
+_kernel_ssyrk_nt_l_20x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_20x4_vs_lib8
+	.def kernel_ssyrk_nt_l_20x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_20x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG5, %r13  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG6, %r11 // beta
+	movq	ARG7, %r12   // C
+	movl	ARG8, %r13d // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movl	ARG10, %r11d // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+	call inner_store_l_20x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_20x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_20x4_vs_lib8, .-kernel_ssyrk_nt_l_20x4_vs_lib8
+#endif
+
+
+
+
+
+//                                   1      2             3         4        5            6         7        8         9
+// void kernel_strmm_nn_rl_24x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd);
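+//
+// Operation sketch, read from the triangle / gemm / scale stages below and hedged
+// accordingly: D <= alpha * A * B with B lower triangular ("right-lower", not
+// transposed), offsetB assumed to be B's starting row offset (0..7) inside its
+// leading 8-row panel:
+//
+//   kernel_strmm_nn_rl_24x4_lib8(k, &alpha, A, sda, 0, B, sdb, D, sdd);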
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_24x4_lib8
+	.type kernel_strmm_nn_rl_24x4_lib8, @function
+kernel_strmm_nn_rl_24x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_24x4_lib8
+_kernel_strmm_nn_rl_24x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_24x4_lib8
+	.def kernel_strmm_nn_rl_24x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_24x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_24x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_24x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_24x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_24x4_lib8, .-kernel_strmm_nn_rl_24x4_lib8
+#endif
+
+
+
+
+
+//                                      1      2             3         4        5            6         7        8         9        10      11
+// void kernel_strmm_nn_rl_24x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_24x4_vs_lib8
+	.type kernel_strmm_nn_rl_24x4_vs_lib8, @function
+kernel_strmm_nn_rl_24x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_24x4_vs_lib8
+_kernel_strmm_nn_rl_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_24x4_vs_lib8
+	.def kernel_strmm_nn_rl_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_24x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+	vmovaps	%ymm0, %ymm8
+	vmovaps	%ymm0, %ymm9
+	vmovaps	%ymm0, %ymm10
+	vmovaps	%ymm0, %ymm11
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_24x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_24x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_24x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG8, %r10 // D
+	movq	ARG9, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+	movq	ARG10, %r12 // km
+	movq	ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_24x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_24x4_vs_lib8, .-kernel_strmm_nn_rl_24x4_vs_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#endif
+
+#if defined(OS_LINUX)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC04: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_sgemm_8x4_lib8.S b/kernel/avx2/kernel_sgemm_8x4_lib8.S
new file mode 100644
index 0000000..44946f1
--- /dev/null
+++ b/kernel/avx2/kernel_sgemm_8x4_lib8.S
@@ -0,0 +1,7342 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
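+
+// Note on the macros above: PROLOGUE first executes "subq $STACKSIZE, %rsp",
+// so stack arguments are read STACKSIZE bytes above the current %rsp.  On
+// System V (Linux/Mac) the 7th integer argument sits just above the return
+// address, hence ARG7 = STACKSIZE+8(%rsp); on Windows x64 the first stack
+// argument is the 5th one and follows the 32-byte shadow space, hence
+// ARG5 = STACKSIZE+40(%rsp).  Layout after PROLOGUE (System V, illustrative):
+//
+//   0(%rsp) .. 40(%rsp)           saved rbx, rbp, r12-r15
+//   STACKSIZE(%rsp)               return address
+//   STACKSIZE+8(%rsp), +16, ...   ARG7, ARG8, ...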
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
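+
+// Rough plain-C model of the accumulation performed by this routine, assuming
+// the lib8 panel-major layout (one 8-float column of A and of B per k step);
+// acc[ii+8*jj] stands for lane ii of accumulator ymm{jj} (illustrative only):
+//
+//   for (ll = 0; ll < k; ll++)
+//       for (jj = 0; jj < 4; jj++)
+//           for (ii = 0; ii < 8; ii++)
+//               acc[ii + 8*jj] += A[ii + 8*ll] * B[jj + 8*ll];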
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nt_8x4_lib8, @function
+inner_kernel_gemm_add_nt_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nt_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_8x4_lib8:
+#endif
+#endif
+	
+// broadcast scheme
+#if 1
+
+	cmpl	$0, %r10d
+	jle		5f // return
+
+	// preload
+	vmovaps 		0(%r11), %ymm13 // A
+
+	vxorps			%ymm4, %ymm4, %ymm4
+	vmovaps			%ymm4, %ymm5
+	vmovaps			%ymm4, %ymm6
+	vmovaps			%ymm4, %ymm7
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vbroadcastss	0(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vmovaps			32(%r11), %ymm14 // A
+	vbroadcastss	4(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vbroadcastss	8(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vbroadcastss	12(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastss	32(%r12), %ymm12 // B
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vmovaps			64(%r11), %ymm13 // A
+	vbroadcastss	36(%r12), %ymm12 // B
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vbroadcastss	40(%r12), %ymm12 // B
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vbroadcastss	44(%r12), %ymm12 // B
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 2
+	vbroadcastss	64(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vmovaps			-32(%r11), %ymm14 // A
+	vbroadcastss	68(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vbroadcastss	72(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vbroadcastss	76(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	addq	$128, %r12
+
+	// unroll 3
+	vbroadcastss	-32(%r12), %ymm12 // B
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vmovaps			0(%r11), %ymm13 // A
+	vbroadcastss	-28(%r12), %ymm12 // B
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vbroadcastss	-24(%r12), %ymm12 // B
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vbroadcastss	-20(%r12), %ymm12 // B
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vbroadcastss	0(%r12), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vmovaps			32(%r11), %ymm14 // a
+	vbroadcastss	4(%r12), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vbroadcastss	8(%r12), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vbroadcastss	12(%r12), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastss	32(%r12), %ymm12 // B
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+	vmovaps			64(%r11), %ymm13 // A
+	vbroadcastss	36(%r12), %ymm12 // B
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vbroadcastss	40(%r12), %ymm12 // B
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vbroadcastss	44(%r12), %ymm12 // B
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 2
+	vbroadcastss	64(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vmovaps			-32(%r11), %ymm14 // A
+	vbroadcastss	68(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vbroadcastss	72(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vbroadcastss	76(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	addq	$128, %r12
+
+	// unroll 3
+	vbroadcastss	-32(%r12), %ymm12 // B
+	vfmadd231ps		%ymm14, %ymm12, %ymm4
+//	vmovaps			0(%r11), %ymm13 // A
+	vbroadcastss	-28(%r12), %ymm12 // B
+	vfmadd231ps		%ymm14, %ymm12, %ymm5
+	vbroadcastss	-24(%r12), %ymm12 // B
+	vfmadd231ps		%ymm14, %ymm12, %ymm6
+	vbroadcastss	-20(%r12), %ymm12 // B
+	vfmadd231ps		%ymm14, %ymm12, %ymm7
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovaps			0(%r11), %ymm13 // a
+	vbroadcastss	0(%r12), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vbroadcastss	4(%r12), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	subl	$1, %r10d
+	vbroadcastss	8(%r12), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	addq	$32, %r11
+	vbroadcastss	12(%r12), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	addq	$32, %r12
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // reduce
+
+	vaddps			%ymm4, %ymm0, %ymm0
+	vaddps			%ymm5, %ymm1, %ymm1
+	vaddps			%ymm6, %ymm2, %ymm2
+	vaddps			%ymm7, %ymm3, %ymm3
+
+5: // return
+
+// shuffle scheme
+#else
+
+	cmpl	$0, %r10d
+	jle		5f // return
+
+	// preload
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vbroadcastf128	32(%r12), %ymm15 // B
+	vmovaps			32(%r11), %ymm13 // A
+
+	vxorps			%ymm4, %ymm4, %ymm4
+	vmovaps			%ymm4, %ymm5
+	vmovaps			%ymm4, %ymm6
+	vmovaps			%ymm4, %ymm7
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vfmadd231ps		%ymm12, %ymm14, %ymm0
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm1
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm2
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm3
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+	vbroadcastf128	96(%r12), %ymm15 // B
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vfmadd231ps		%ymm12, %ymm14, %ymm0
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm1
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm2
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm3
+	vbroadcastf128	128(%r12), %ymm14 // B
+	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+	vbroadcastf128	32(%r12), %ymm15 // B
+	vmovaps			32(%r11), %ymm13 // A
+
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vfmadd231ps		%ymm12, %ymm14, %ymm0
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm1
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm2
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm3
+	vbroadcastf128	64(%r12), %ymm14 // B
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+	vbroadcastf128	96(%r12), %ymm15 // B
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vfmadd231ps		%ymm12, %ymm14, %ymm0
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm1
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm2
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm3
+//	vbroadcastf128	128(%r12), %ymm14 // B
+//	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+//	vbroadcastf128	32(%r12), %ymm15 // B
+//	vmovaps			32(%r11), %ymm13 // A
+
+
+//	cmpl	$4, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm0, %ymm0
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm1, %ymm1
+
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm2, %ymm2
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$32, %r12
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vmulps			%ymm12, %ymm14, %ymm11
+	vaddps			%ymm11, %ymm3, %ymm3
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // reduce
+
+	vaddps			%ymm4, %ymm0, %ymm0
+	vaddps			%ymm5, %ymm1, %ymm1
+	vaddps			%ymm6, %ymm2, %ymm2
+	vaddps			%ymm7, %ymm3, %ymm3
+
+5: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nt_8x4_lib8, .-inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_sub_nt_8x4_lib8, @function
+inner_kernel_gemm_sub_nt_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_sub_nt_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_8x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		5f // return
+
+	// preload
+	vmovaps 		0(%r11), %ymm13 // A
+
+	vxorps			%ymm4, %ymm4, %ymm4
+	vmovaps			%ymm4, %ymm5
+	vmovaps			%ymm4, %ymm6
+	vmovaps			%ymm4, %ymm7
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vbroadcastss	0(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vmovaps			32(%r11), %ymm14 // A
+	vbroadcastss	4(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vbroadcastss	8(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vbroadcastss	12(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastss	32(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	vmovaps			64(%r11), %ymm13 // A
+	vbroadcastss	36(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vbroadcastss	40(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vbroadcastss	44(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 2
+	vbroadcastss	64(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vmovaps			-32(%r11), %ymm14 // A
+	vbroadcastss	68(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vbroadcastss	72(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vbroadcastss	76(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	addq	$128, %r12
+
+	// unroll 3
+	vbroadcastss	-32(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	vmovaps			0(%r11), %ymm13 // A
+	vbroadcastss	-28(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vbroadcastss	-24(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vbroadcastss	-20(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vbroadcastss	0(%r12), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vmovaps			32(%r11), %ymm14 // a
+	vbroadcastss	4(%r12), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vbroadcastss	8(%r12), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vbroadcastss	12(%r12), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	subl	$4, %r10d
+
+	// unroll 1
+	vbroadcastss	32(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+	vmovaps			64(%r11), %ymm13 // A
+	vbroadcastss	36(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vbroadcastss	40(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vbroadcastss	44(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 2
+	vbroadcastss	64(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vmovaps			-32(%r11), %ymm14 // A
+	vbroadcastss	68(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vbroadcastss	72(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vbroadcastss	76(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	addq	$128, %r12
+
+	// unroll 3
+	vbroadcastss	-32(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm14, %ymm12, %ymm4
+//	vmovaps			0(%r11), %ymm13 // A
+	vbroadcastss	-28(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm14, %ymm12, %ymm5
+	vbroadcastss	-24(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm14, %ymm12, %ymm6
+	vbroadcastss	-20(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm14, %ymm12, %ymm7
+
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovaps			0(%r11), %ymm13 // a
+	vbroadcastss	0(%r12), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vbroadcastss	4(%r12), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	subl	$1, %r10d
+	vbroadcastss	8(%r12), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	addq	$32, %r11
+	vbroadcastss	12(%r12), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	addq	$32, %r12
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // reduce
+
+	vaddps			%ymm4, %ymm0, %ymm0
+	vaddps			%ymm5, %ymm1, %ymm1
+	vaddps			%ymm6, %ymm2, %ymm2
+	vaddps			%ymm7, %ymm3, %ymm3
+
+5: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_sub_nt_8x4_lib8, .-inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
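+
+// Rough plain-C model of the accumulation (nn variant: B is swept row by row
+// across 8-row panels with panel stride sdb); acc[ii+8*jj] stands for lane ii
+// of accumulator ymm{jj} (illustrative only):
+//
+//   for (ll = 0; ll < k; ll++)
+//       for (jj = 0; jj < 4; jj++)
+//           for (ii = 0; ii < 8; ii++)
+//               acc[ii + 8*jj] += A[ii + 8*ll] * B[ll%8 + 8*jj + (ll/8)*8*sdb];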
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nn_8x4_lib8, @function
+inner_kernel_gemm_add_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_8x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$8, %r10d
+	jl		0f // consider clean-up loop
+
+	vxorps			%ymm4, %ymm4, %ymm4
+	vmovaps			%ymm4, %ymm5
+	vmovaps			%ymm4, %ymm6
+	vmovaps			%ymm4, %ymm7
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+//	prefetcht0	0(%r12, %r13, 1) // software prefetch
+//	prefetcht0	64(%r12, %r13, 1) // software prefetch
+
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm3
+
+	// unroll 1
+	vmovaps			32(%r11), %ymm12 // A[0]
+	vbroadcastss	4(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm4
+	vbroadcastss	36(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm5
+	vbroadcastss	68(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm6
+	vbroadcastss	100(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm7
+
+	// unroll 2
+	vmovaps			64(%r11), %ymm12 // A[0]
+	vbroadcastss	8(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm0
+	vbroadcastss	40(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm1
+	vbroadcastss	72(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm2
+	vbroadcastss	104(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm3
+
+	// unroll 3
+	vmovaps			96(%r11), %ymm12 // A[0]
+	vbroadcastss	12(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm4
+	vbroadcastss	44(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm5
+	vbroadcastss	76(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm6
+	vbroadcastss	108(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm7
+
+	// unroll 4
+	vmovaps			128(%r11), %ymm12 // A[0]
+	vbroadcastss	16(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm0
+	vbroadcastss	48(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm1
+	vbroadcastss	80(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm2
+	vbroadcastss	112(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm3
+
+	// unroll 5
+	vmovaps			160(%r11), %ymm12 // A[0]
+	vbroadcastss	20(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm4
+	vbroadcastss	52(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm5
+	vbroadcastss	84(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm6
+	vbroadcastss	116(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm7
+	subl	$8, %r10d
+
+	// unroll 6
+	vmovaps			192(%r11), %ymm12 // A[0]
+	vbroadcastss	24(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm0
+	vbroadcastss	56(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm1
+	vbroadcastss	88(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm2
+	vbroadcastss	120(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm3
+	addq	$256, %r11
+
+	// unroll 7
+	vmovaps			-32(%r11), %ymm12 // A[0]
+	vbroadcastss	28(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm4
+	vbroadcastss	60(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm5
+	vbroadcastss	92(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm6
+	vbroadcastss	124(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm7
+	addq	%r13, %r12 // B += 8*sdb*sizeof(float)
+
+	cmpl	$7, %r10d
+	jg		1b // main loop 
+
+	vaddps			%ymm4, %ymm0, %ymm0
+	vaddps			%ymm5, %ymm1, %ymm1
+	vaddps			%ymm6, %ymm2, %ymm2
+	vaddps			%ymm7, %ymm3, %ymm3
+
+
+0: // consider clean1-up loop
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean1-up loop
+	
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm3
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$4, %r12
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nn_8x4_lib8, .-inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_SUB_NN_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_sub_nn_8x4_lib8, @function
+inner_kernel_gemm_sub_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_sub_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nn_8x4_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$8, %r10d
+	jl		0f // consider clean-up loop
+
+	vxorps			%ymm4, %ymm4, %ymm4
+	vmovaps			%ymm4, %ymm5
+	vmovaps			%ymm4, %ymm6
+	vmovaps			%ymm4, %ymm7
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	prefetcht0	0(%r12, %r13, 1) // software prefetch
+	prefetcht0	64(%r12, %r13, 1) // software prefetch
+
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm3
+
+	// unroll 1
+	vmovaps			32(%r11), %ymm12 // A[0]
+	vbroadcastss	4(%r12), %ymm13 // B[0]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm4
+	vbroadcastss	36(%r12), %ymm13 // B[1]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm5
+	vbroadcastss	68(%r12), %ymm13 // B[2]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm6
+	vbroadcastss	100(%r12), %ymm13 // B[3]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm7
+
+	// unroll 2
+	vmovaps			64(%r11), %ymm12 // A[0]
+	vbroadcastss	8(%r12), %ymm13 // B[0]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm0
+	vbroadcastss	40(%r12), %ymm13 // B[1]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm1
+	vbroadcastss	72(%r12), %ymm13 // B[2]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm2
+	vbroadcastss	104(%r12), %ymm13 // B[3]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm3
+
+	// unroll 3
+	vmovaps			96(%r11), %ymm12 // A[0]
+	vbroadcastss	12(%r12), %ymm13 // B[0]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm4
+	vbroadcastss	44(%r12), %ymm13 // B[1]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm5
+	vbroadcastss	76(%r12), %ymm13 // B[2]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm6
+	vbroadcastss	108(%r12), %ymm13 // B[3]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm7
+
+	// unroll 4
+	vmovaps			128(%r11), %ymm12 // A[0]
+	vbroadcastss	16(%r12), %ymm13 // B[0]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm0
+	vbroadcastss	48(%r12), %ymm13 // B[1]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm1
+	vbroadcastss	80(%r12), %ymm13 // B[2]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm2
+	vbroadcastss	112(%r12), %ymm13 // B[3]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm3
+
+	// unroll 5
+	vmovaps			160(%r11), %ymm12 // A[0]
+	vbroadcastss	20(%r12), %ymm13 // B[0]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm4
+	vbroadcastss	52(%r12), %ymm13 // B[1]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm5
+	vbroadcastss	84(%r12), %ymm13 // B[2]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm6
+	vbroadcastss	116(%r12), %ymm13 // B[3]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm7
+	subl	$8, %r10d
+
+	// unroll 6
+	vmovaps			192(%r11), %ymm12 // A[0]
+	vbroadcastss	24(%r12), %ymm13 // B[0]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm0
+	vbroadcastss	56(%r12), %ymm13 // B[1]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm1
+	vbroadcastss	88(%r12), %ymm13 // B[2]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm2
+	vbroadcastss	120(%r12), %ymm13 // B[3]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm3
+	addq	$256, %r11
+
+	// unroll 7
+	vmovaps			-32(%r11), %ymm12 // A[0]
+	vbroadcastss	28(%r12), %ymm13 // B[0]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm4
+	vbroadcastss	60(%r12), %ymm13 // B[1]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm5
+	vbroadcastss	92(%r12), %ymm13 // B[2]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm6
+	vbroadcastss	124(%r12), %ymm13 // B[3]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm7
+	addq	%r13, %r12 // B += 8*sdb*sizeof(float)
+
+	cmpl	$7, %r10d
+	jg		1b // main loop 
+
+	vaddps			%ymm4, %ymm0, %ymm0
+	vaddps			%ymm5, %ymm1, %ymm1
+	vaddps			%ymm6, %ymm2, %ymm2
+	vaddps			%ymm7, %ymm3, %ymm3
+
+
+0: // consider clean1-up loop
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean1-up loop
+	
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vfnmadd231ps	%ymm12, %ymm13, %ymm3
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$4, %r12
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_sub_nn_8x4_lib8, .-inner_kernel_gemm_sub_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(float)
+// r14   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(8-offB)
+// r11   <- A+(8-offB)*bs*sizeof(float)
+// r12   <- B-offB+bs*sdb*sizeof(float)
+// r13   <- bs*sdb*sizeof(float)
+// r14   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
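+
+// In plain C the peel handled here amounts to (illustrative only):
+//
+//   kend = (offB == 0 || k == 0) ? 0 : (k < 8 - offB ? k : 8 - offB);
+//   // run kend plain nn updates starting offB entries into the current B
+//   // panel, then move B to the start of the next 8-row panel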
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_gemm_add_nn_8x4_lib8, @function
+inner_edge_gemm_add_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_gemm_add_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_8x4_lib8:
+#endif
+#endif
+	
+	cmpl			$0, %r14d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$8, %r15d
+	subl			%r14d, %r15d // 8-offsetB
+	cmpl			%r10d, %r15d
+//	jle				0f
+//	movl			%r10d, %r15d // kend=min(k,8-offsetB)
+//0:
+	cmovgl			%r10d, %r15d // kend=min(k,8-offsetB)
+
+	movl			%r14d, %eax
+	sall			$2, %eax // offsetB*sizeof(float)
+	addq			%rax, %r12 // B+offsetB*sizeof(float)
+
+1:
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+	vbroadcastss	96(%r12), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm3, %ymm3
+
+	subl			$1, %r10d // k-1
+	subl			$1, %r15d // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(float)
+	addq			$4, %r12 // B+1*sizeof(float)
+
+	cmpl			$0, %r15d
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r13, %r12
+	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_gemm_add_nn_8x4_lib8, .-inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30]
+// ymm1  <- [d01 d11 d21 d31]
+// ymm2  <- [d02 d12 d22 d32]
+// ymm3  <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trmm_nn_rl_8x4_lib8, @function
+inner_edge_trmm_nn_rl_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trmm_nn_rl_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_8x4_lib8:
+#endif
+#endif
+	
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	movl		%r14d, %eax
+	sall		$2, %eax // offsetB*sizeof(float)
+	movq		%r12, %rbx // B
+	addq		%rax, %rbx // B+offsetB*sizeof(float)
+
+
+	cmpl	$4, %r14d
+	jg		1f
+
+	// offB==0, 1, 2, 3, 4
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	8(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	40(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	72(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	jmp			0f // end
+
+
+1:
+	cmpl	$5, %r14d
+	jg		1f
+
+	// offB==5
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	8(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	40(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	72(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r13, %r12 // B+8*sdb*sizeof(float)
+	movl		$0, %r14d // offsetB=0
+
+	jmp			0f // end
+
+
+1:
+	cmpl	$6, %r14d
+	jg		1f
+
+	// offB==6
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r13, %r12 // B+8*sdb*sizeof(float)
+	movq		%r12, %rbx // B
+	movl		$0, %r14d // offsetB=0
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	64(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	jmp			0f // end
+
+
+1:
+//	cmpl	$7, %r14d
+//	jg		0f
+
+	// offB==7
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addq		%r13, %r12 // B+8*sdb*sizeof(float)
+	movq		%r12, %rbx // B
+	movl		$0, %r14d // offsetB=0
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	0(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	32(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+	cmpl		$0, %r10d
+	jle			0f // end
+
+	vmovaps			0(%r11), %ymm8
+	vbroadcastss	4(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm0, %ymm0
+	vbroadcastss	36(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm1, %ymm1
+	vbroadcastss	68(%rbx), %ymm12
+	vmulps			%ymm8, %ymm12, %ymm15
+	vaddps			%ymm15, %ymm2, %ymm2
+
+	subl		$1, %r10d // k-1
+	addq		$32, %r11 // A+1*bs*sizeof(float)
+	addl		$1, %r14d // offsetB+1
+
+//	jmp			0f // end
+
+
+	// end
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trmm_nn_rl_8x4_lib8, .-inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
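+
+// Rough plain-C model of this solve (right side, lower triangular E,
+// transposed, diagonal pre-inverted in inv_diag_E); acc[ii+8*jj] is lane ii of
+// ymm{jj}, E[ii+8*jj] is element (ii,jj) of the 4x4 triangle stored in an
+// 8-row panel (illustrative only):
+//
+//   for (jj = 0; jj < 4; jj++) {
+//       for (ii = 0; ii < 8; ii++)
+//           acc[ii + 8*jj] *= inv_diag_E[jj];
+//       for (kk = jj+1; kk < 4; kk++)
+//           for (ii = 0; ii < 8; ii++)
+//               acc[ii + 8*kk] -= E[kk + 8*jj] * acc[ii + 8*jj];
+//   }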
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsm_rlt_inv_8x4_lib8, @function
+inner_edge_trsm_rlt_inv_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsm_rlt_inv_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x4_lib8:
+#endif
+#endif
+
+	vbroadcastss	0(%r11), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vbroadcastss	4(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm1, %ymm1
+	vbroadcastss	8(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vbroadcastss	12(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	4(%r11), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vbroadcastss	40(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vbroadcastss	44(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	8(%r11), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vbroadcastss	76(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	12(%r11), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsm_rlt_inv_8x4_lib8, .-inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsm_rlt_inv_8x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsm_rlt_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#endif
+#endif
+
+	vbroadcastss	0(%r11), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	cmpl			$2, %r12d
+	jl				0f // ret
+	vbroadcastss	4(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm1, %ymm1
+	vbroadcastss	8(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vbroadcastss	12(%r10), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	4(%r11), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	cmpl			$3, %r12d
+	jl				0f // ret
+	vbroadcastss	40(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm2, %ymm2
+	vbroadcastss	44(%r10), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	8(%r11), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	cmpl			$4, %r12d
+	jl				0f // ret
+	vbroadcastss	76(%r10), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm12
+	vsubps			%ymm12, %ymm3, %ymm3
+
+	vbroadcastss	12(%r11), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsm_rlt_inv_8x4_vs_lib8, .-inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSM_RLT_INV_4X8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsm_rlt_inv_4x8_vs_lib8, @function
+inner_edge_trsm_rlt_inv_4x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsm_rlt_inv_4x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_4x8_vs_lib8:
+#endif
+#endif
+
+	vbroadcastss	0(%r11), %xmm13
+	vmulps			%xmm0, %xmm13, %xmm0
+	vbroadcastss	4(%r10), %xmm13
+	vfnmadd231ps	%xmm0, %xmm13, %xmm1
+	vbroadcastss	8(%r10), %xmm13
+	vfnmadd231ps	%xmm0, %xmm13, %xmm2
+	vbroadcastss	12(%r10), %xmm13
+	vfnmadd231ps	%xmm0, %xmm13, %xmm3
+	vbroadcastss	16(%r10), %xmm13
+	vfnmadd231ps	%xmm0, %xmm13, %xmm4
+	vbroadcastss	20(%r10), %xmm13
+	vfnmadd231ps	%xmm0, %xmm13, %xmm5
+	vbroadcastss	24(%r10), %xmm13
+	vfnmadd231ps	%xmm0, %xmm13, %xmm6
+	vbroadcastss	28(%r10), %xmm13
+	vfnmadd231ps	%xmm0, %xmm13, %xmm7
+
+	vbroadcastss	4(%r11), %xmm13
+	vmulps			%xmm1, %xmm13, %xmm1
+	vbroadcastss	40(%r10), %xmm13
+	vfnmadd231ps	%xmm1, %xmm13, %xmm2
+	vbroadcastss	44(%r10), %xmm13
+	vfnmadd231ps	%xmm1, %xmm13, %xmm3
+	vbroadcastss	48(%r10), %xmm13
+	vfnmadd231ps	%xmm1, %xmm13, %xmm4
+	vbroadcastss	52(%r10), %xmm13
+	vfnmadd231ps	%xmm1, %xmm13, %xmm5
+	vbroadcastss	56(%r10), %xmm13
+	vfnmadd231ps	%xmm1, %xmm13, %xmm6
+	vbroadcastss	60(%r10), %xmm13
+	vfnmadd231ps	%xmm1, %xmm13, %xmm7
+
+	vbroadcastss	8(%r11), %xmm13
+	vmulps			%xmm2, %xmm13, %xmm2
+	vbroadcastss	76(%r10), %xmm13
+	vfnmadd231ps	%xmm2, %xmm13, %xmm3
+	vbroadcastss	80(%r10), %xmm13
+	vfnmadd231ps	%xmm2, %xmm13, %xmm4
+	vbroadcastss	84(%r10), %xmm13
+	vfnmadd231ps	%xmm2, %xmm13, %xmm5
+	vbroadcastss	88(%r10), %xmm13
+	vfnmadd231ps	%xmm2, %xmm13, %xmm6
+	vbroadcastss	92(%r10), %xmm13
+	vfnmadd231ps	%xmm2, %xmm13, %xmm7
+
+	vbroadcastss	12(%r11), %xmm13
+	vmulps			%xmm3, %xmm13, %xmm3
+	vbroadcastss	112(%r10), %xmm13
+	vfnmadd231ps	%xmm3, %xmm13, %xmm4
+	vbroadcastss	116(%r10), %xmm13
+	vfnmadd231ps	%xmm3, %xmm13, %xmm5
+	vbroadcastss	120(%r10), %xmm13
+	vfnmadd231ps	%xmm3, %xmm13, %xmm6
+	vbroadcastss	124(%r10), %xmm13
+	vfnmadd231ps	%xmm3, %xmm13, %xmm7
+
+	vbroadcastss	16(%r11), %xmm13
+	vmulps			%xmm4, %xmm13, %xmm4
+	cmpl			$6, %r12d
+	jl				0f // ret
+	vbroadcastss	148(%r10), %xmm13
+	vfnmadd231ps	%xmm4, %xmm13, %xmm5
+	vbroadcastss	152(%r10), %xmm13
+	vfnmadd231ps	%xmm4, %xmm13, %xmm6
+	vbroadcastss	156(%r10), %xmm13
+	vfnmadd231ps	%xmm4, %xmm13, %xmm7
+
+	vbroadcastss	20(%r11), %xmm13
+	vmulps			%xmm5, %xmm13, %xmm5
+	cmpl			$7, %r12d
+	jl				0f // ret
+	vbroadcastss	184(%r10), %xmm13
+	vfnmadd231ps	%xmm5, %xmm13, %xmm6
+	vbroadcastss	188(%r10), %xmm13
+	vfnmadd231ps	%xmm5, %xmm13, %xmm7
+
+	vbroadcastss	24(%r11), %xmm13
+	vmulps			%xmm6, %xmm13, %xmm6
+	cmpl			$8, %r12d
+	jl				0f // ret
+	vbroadcastss	220(%r10), %xmm13
+	vfnmadd231ps	%xmm6, %xmm13, %xmm7
+
+	vbroadcastss	28(%r11), %xmm13
+	vmulps			%xmm7, %xmm13, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsm_rlt_inv_4x8_vs_lib8, .-inner_edge_trsm_rlt_inv_4x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization 
+//
+// input arguments:
+// r10  <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
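+
+// Rough plain-C model of this factorization step (Cholesky on the 4 leading
+// columns of the 8x4 accumulator block; the inverted diagonal is written to
+// inv_diag_E); acc[ii+8*jj] is lane ii of ymm{jj} (illustrative only):
+//
+//   for (jj = 0; jj < 4; jj++) {
+//       float d = acc[jj + 8*jj];
+//       float inv = d > 0.0f ? 1.0f / sqrtf(d) : 0.0f;
+//       inv_diag_E[jj] = inv;
+//       for (ii = 0; ii < 8; ii++)
+//           acc[ii + 8*jj] *= inv;
+//       for (kk = jj+1; kk < 4; kk++)
+//           for (ii = 0; ii < 8; ii++)
+//               acc[ii + 8*kk] -= acc[kk + 8*jj] * acc[ii + 8*jj];
+//   }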
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_8x4_lib8, @function
+inner_edge_potrf_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x4_lib8:
+#endif
+#endif
+
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vmovss		%xmm0, %xmm0, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+2:
+	vmovss		%xmm13, 0(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm0
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm1, %ymm1
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0x55, %xmm1, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+4:
+	vmovss		%xmm13, 4(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm1
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm11
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0xaa, %xmm2, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+6:
+	vmovss		%xmm13, 8(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm2
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm11
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0xff, %xmm3, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+8:
+	vmovss		%xmm13, 12(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm3
+
+	jmp		0f
+
+
+1:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_8x4_lib8, .-inner_edge_potrf_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_8x4_vs_lib8, @function
+inner_edge_potrf_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x4_vs_lib8:
+#endif
+#endif
+
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vmovss		%xmm0, %xmm0, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+2:
+	vmovss		%xmm13, 0(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm0
+	cmpl		$2, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm0, %ymm0, %ymm11
+	vpermilps	$0x55, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm1, %ymm1
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm0, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0x55, %xmm1, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+4:
+	vmovss		%xmm13, 4(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm1
+	cmpl		$3, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm1, %ymm1, %ymm11
+	vpermilps	$0xaa, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm2, %ymm2
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm1, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0xaa, %xmm2, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+6:
+	vmovss		%xmm13, 8(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm2
+	cmpl		$4, %r11d
+	jl			0f // ret
+	vperm2f128	$0x00, %ymm2, %ymm2, %ymm11
+	vpermilps	$0xff, %ymm11, %ymm13
+	vmulps		%ymm2, %ymm13, %ymm12
+	vsubps		%ymm12, %ymm3, %ymm3
+
+
+	vpermilps	$0xff, %xmm3, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss		%xmm13, %xmm13, %xmm13
+	vdivss		%xmm13, %xmm14, %xmm13
+8:
+	vmovss		%xmm13, 12(%r10)
+	vpermilps	$0x00, %xmm13, %xmm13
+	vinsertf128	$0x1, %xmm13, %ymm13, %ymm13
+	vmulps		%ymm3, %ymm13, %ymm3
+
+	jmp		0f
+
+
+1:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_8x4_vs_lib8, .-inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
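+// In scalar terms the routine below performs, roughly (hedged sketch,
+// illustrative names; acc[] is the 8x4 accumulator held in ymm0-ymm3):
+//
+//   for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<8; ii++)
+//           acc[ii+8*jj] = alpha[0]*acc[ii+8*jj] + beta[0]*C[ii+8*jj];
+//
+// with the C term (and its loads) skipped entirely when beta==0.0.
+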
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x4_lib8, @function
+inner_scale_ab_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovaps		0(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x4_lib8, .-inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
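+// Hedged sketch of the transpose-and-scale below (illustrative names): the
+// 8x4 accumulator acc[] is transposed into a 4x8 result res[] left in
+// xmm0-xmm7 and combined with a 4x8 block of C stored 4 floats per column,
+// 32 bytes apart:
+//
+//   for(jj=0; jj<8; jj++)
+//       for(ii=0; ii<4; ii++)
+//           res[ii+8*jj] = alpha[0]*acc[jj+8*ii] + beta[0]*C[ii+8*jj];
+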
+#if MACRO_LEVEL>=1
+	.macro INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_tran_scale_ab_4x8_lib8, @function
+inner_tran_scale_ab_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_tran_scale_ab_4x8_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	// transpose
+	vunpcklps	%ymm1, %ymm0, %ymm5
+	vunpckhps	%ymm1, %ymm0, %ymm4
+	vunpcklps	%ymm3, %ymm2, %ymm7
+	vunpckhps	%ymm3, %ymm2, %ymm6
+
+	vunpcklpd	%ymm7, %ymm5, %ymm0
+	vunpckhpd	%ymm7, %ymm5, %ymm1
+	vunpcklpd	%ymm6, %ymm4, %ymm2
+	vunpckhpd	%ymm6, %ymm4, %ymm3
+
+	vextractf128 $0x1, %ymm0, %xmm4
+	vextractf128 $0x1, %ymm1, %xmm5
+	vextractf128 $0x1, %ymm2, %xmm6
+	vextractf128 $0x1, %ymm3, %xmm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovaps		0(%r12), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm0
+	vmovaps		32(%r12), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm1
+	vmovaps		64(%r12), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm2
+	vmovaps		96(%r12), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm3
+	vmovaps		128(%r12), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm4
+	vmovaps		160(%r12), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm5
+	vmovaps		192(%r12), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm6
+	vmovaps		224(%r12), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_tran_scale_ab_4x8_lib8, .-inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// r15  <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x4_gen_lib8, @function
+inner_scale_ab_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_gen_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm0
+	vmovaps		32(%r13), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm1
+	vmovaps		64(%r13), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm2
+	vmovaps		96(%r13), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm3
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x4_gen_lib8, .-inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// r15  <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_TRAN_SCALE_AB_4X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_tran_scale_ab_4x8_gen_lib8, @function
+inner_tran_scale_ab_4x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_tran_scale_ab_4x8_gen_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_gen_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	// transpose
+	vunpcklps	%ymm1, %ymm0, %ymm5
+	vunpckhps	%ymm1, %ymm0, %ymm4
+	vunpcklps	%ymm3, %ymm2, %ymm7
+	vunpckhps	%ymm3, %ymm2, %ymm6
+
+	vunpcklpd	%ymm7, %ymm5, %ymm0
+	vunpckhpd	%ymm7, %ymm5, %ymm1
+	vunpcklpd	%ymm6, %ymm4, %ymm2
+	vunpckhpd	%ymm6, %ymm4, %ymm3
+
+	vextractf128 $0x1, %ymm0, %xmm4
+	vextractf128 $0x1, %ymm1, %xmm5
+	vextractf128 $0x1, %ymm2, %xmm6
+	vextractf128 $0x1, %ymm3, %xmm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm0
+	vmovaps		32(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm1
+	vmovaps		64(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm2
+	vmovaps		96(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm3
+	vmovaps		128(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm4
+	vmovaps		160(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm5
+	vmovaps		192(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm6
+	vmovaps		224(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_tran_scale_ab_4x8_gen_lib8, .-inner_tran_scale_ab_4x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10   <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
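+// Hedged sketch (illustrative names): the routine below only scales the
+// accumulator, acc[ii+8*jj] *= alpha[0] over the whole 8x4 block; no memory
+// beyond alpha is read.
+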
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_A0_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_a0_8x4_lib8, @function
+inner_scale_a0_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_a0_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_8x4_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_a0_8x4_lib8, .-inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
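+// Hedged sketch (illustrative names): the routine below adds C to the
+// accumulator, acc[ii+8*jj] += C[ii+8*jj] over the 8x4 block, implemented as
+// an fmadd with the LC03 constant (assumed to hold 1.0 in every lane).
+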
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_8x4_lib8, @function
+inner_scale_11_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_11_8x4_lib8:
+#endif
+#endif
+	
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovaps		.LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovaps		LC03(%rip), %ymm14
+#endif
+
+	vmovaps		0(%r10), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm0
+	vmovaps		32(%r10), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm1
+	vmovaps		64(%r10), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm2
+	vmovaps		96(%r10), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_8x4_lib8, .-inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_TRAN_SCALE_11_4X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_tran_scale_11_4x8_lib8, @function
+inner_tran_scale_11_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_11_4x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_tran_scale_11_4x8_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_11_4x8_lib8:
+#endif
+#endif
+	
+	// transpose
+	vunpcklps	%ymm1, %ymm0, %ymm5
+	vunpckhps	%ymm1, %ymm0, %ymm4
+	vunpcklps	%ymm3, %ymm2, %ymm7
+	vunpckhps	%ymm3, %ymm2, %ymm6
+
+	vunpcklpd	%ymm7, %ymm5, %ymm0
+	vunpckhpd	%ymm7, %ymm5, %ymm1
+	vunpcklpd	%ymm6, %ymm4, %ymm2
+	vunpckhpd	%ymm6, %ymm4, %ymm3
+
+	vextractf128 $0x1, %ymm0, %xmm4
+	vextractf128 $0x1, %ymm1, %xmm5
+	vextractf128 $0x1, %ymm2, %xmm6
+	vextractf128 $0x1, %ymm3, %xmm7
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovaps		.LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovaps		LC03(%rip), %ymm14
+#endif
+
+	vmovaps		0(%r10), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm0
+	vmovaps		32(%r10), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm1
+	vmovaps		64(%r10), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm2
+	vmovaps		96(%r10), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm3
+	vmovaps		128(%r10), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm4
+	vmovaps		160(%r10), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm5
+	vmovaps		192(%r10), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm6
+	vmovaps		224(%r10), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_tran_scale_11_4x8_lib8, .-inner_tran_scale_11_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// r15  <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_8X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_8x4_gen_lib8, @function
+inner_scale_11_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_8x4_gen_lib8:
+#endif
+#endif
+	
+
+	// offset==0
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovaps		.LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+	vmovaps		LC03(%rip), %ymm14
+#endif
+
+	vmovaps		0(%r11), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm0
+	vmovaps		32(%r11), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm1
+	vmovaps		64(%r11), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm2
+	vmovaps		96(%r11), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm3
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %r15 // C0
+	addq	%r12, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_8x4_gen_lib8, .-inner_scale_11_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
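+// The two vblendps stages below un-rotate a diagonally stored accumulator
+// (the layout documented for the *_gen variant further down). Shown for the
+// low 128-bit lane; the high lane follows the same per-lane pattern:
+//
+//   in:  ymm0=[d00 d11 d22 d33]  ymm1=[d01 d10 d23 d32]
+//        ymm2=[d03 d12 d21 d30]  ymm3=[d02 d13 d20 d31]
+//   out: ymm0=[d00 d10 d20 d30]  ymm1=[d01 d11 d21 d31]
+//        ymm2=[d02 d12 d22 d32]  ymm3=[d03 d13 d23 d33]
+//
+// after which the alpha/beta scaling matches inner_scale_ab_8x4_lib8 above.
+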
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x4_lib8, @function
+inner_blend_scale_ab_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm10
+	vblendps	$0x55, %ymm3, %ymm2, %ymm11
+
+	vblendps	$0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm10, %ymm9, %ymm1
+	vblendps	$0x33, %ymm10, %ymm9, %ymm3
+
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovaps		0(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vmulps		%ymm15, %ymm14, %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x4_lib8, .-inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// r15  <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x4_gen_lib8, @function
+inner_blend_scale_ab_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_gen_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm10
+	vblendps	$0x55, %ymm3, %ymm2, %ymm11
+
+	vblendps	$0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm10, %ymm9, %ymm1
+	vblendps	$0x33, %ymm10, %ymm9, %ymm3
+
+	// alpha
+	vbroadcastss	0(%r10), %ymm15
+
+	vmulps		%ymm0, %ymm15, %ymm0
+	vmulps		%ymm1, %ymm15, %ymm1
+	vmulps		%ymm2, %ymm15, %ymm2
+	vmulps		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+
+	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x4_gen_lib8, .-inner_blend_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_8x4_lib8, @function
+inner_blend_scale_11_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_8x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm10
+	vblendps	$0x55, %ymm3, %ymm2, %ymm11
+
+	vblendps	$0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm10, %ymm9, %ymm1
+	vblendps	$0x33, %ymm10, %ymm9, %ymm3
+
+	vmovaps		0(%r10), %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r10), %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r10), %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r10), %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_8x4_lib8, .-inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_8X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_8x4_gen_lib8, @function
+inner_blend_scale_11_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_gen_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm10
+	vblendps	$0x55, %ymm3, %ymm2, %ymm11
+
+	vblendps	$0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm10, %ymm9, %ymm1
+	vblendps	$0x33, %ymm10, %ymm9, %ymm3
+
+	// offset==0
+
+	vmovaps		0(%r11), %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r11), %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r11), %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r11), %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %r15 // C0
+	addq	%r12, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_8x4_gen_lib8, .-inner_blend_scale_11_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_lib8, @function
+inner_store_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_lib8:
+#endif
+#endif
+	
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps 	%ymm3, 96(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_lib8, .-inner_store_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x8_lib8, @function
+inner_store_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x8_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_lib8:
+#endif
+#endif
+	
+	vmovaps 	%xmm0,  0(%r10)
+	vmovaps 	%xmm1, 32(%r10)
+	vmovaps 	%xmm2, 64(%r10)
+	vmovaps 	%xmm3, 96(%r10)
+	vmovaps 	%xmm4, 128(%r10)
+	vmovaps 	%xmm5, 160(%r10)
+	vmovaps 	%xmm6, 192(%r10)
+	vmovaps 	%xmm7, 224(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_lib8, .-inner_store_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
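+// Hedged sketch of the row mask built below (LC00 is assumed to hold an
+// increasing index ramp, one entry per row; names are illustrative):
+//
+//   for(ii=0; ii<8; ii++) mask[ii] = (ramp[ii] < (float)km) ? ~0 : 0;
+//
+// vmaskmovps then writes only the first km rows of each column, while the
+// compares on kn (r12d) skip trailing columns altogether.
+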
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_vs_lib8, @function
+inner_store_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%ymm14, %ymm12, %ymm14
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm14,  0(%r10)
+	cmpl		$2, %r12d
+	jl			0f // end
+	vmaskmovps	%ymm1, %ymm14, 32(%r10)
+	cmpl		$3, %r12d
+	jl			0f // end
+	vmaskmovps	%ymm2, %ymm14, 64(%r10)
+	je			0f // end
+	vmaskmovps	%ymm3, %ymm14, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_vs_lib8, .-inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x8_vs_lib8, @function
+inner_store_4x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %xmm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vsubps		%xmm14, %xmm12, %xmm14
+
+	// offset==0
+	vmaskmovps	%xmm0, %xmm14,  0(%r10)
+	cmpl		$2, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm1, %xmm14, 32(%r10)
+	cmpl		$3, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm2, %xmm14, 64(%r10)
+	cmpl		$4, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm3, %xmm14, 96(%r10)
+	cmpl		$5, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm4, %xmm14, 128(%r10)
+	cmpl		$6, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm5, %xmm14, 160(%r10)
+	cmpl		$7, %r12d
+	jl			0f // end
+	vmaskmovps	%xmm6, %xmm14, 192(%r10)
+	je			0f // end
+	vmaskmovps	%xmm7, %xmm14, 224(%r10)
+	//
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_vs_lib8, .-inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
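+// Hedged sketch of the generalized store below (illustrative names; LC00 is
+// assumed to hold an increasing index ramp, one entry per row):
+//
+//   // rows: keep only the window [m0, m1)
+//   for(ii=0; ii<8; ii++)
+//       mask[ii] = (ramp[ii] > (float)m0 && ramp[ii] < (float)m1) ? ~0 : 0;
+//   // cols: drop the first n0 columns by shifting ymm1..ymm3 down and
+//   // advancing D by 32 bytes per dropped column, then store
+//   // min(n1,4)-n0 masked columns; the offset>0 cases are still TODO.
+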
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x4_gen_lib8, @function
+inner_store_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm12, %ymm15
+	vandps		%ymm14, %ymm15, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm15,  0(%r11)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmaskmovps	%ymm1, %ymm15, 32(%r11)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmaskmovps	%ymm2, %ymm15, 64(%r11)
+	je			7f // end
+	vmaskmovps	%ymm3, %ymm15, 96(%r11)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x4_gen_lib8, .-inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x8_gen_lib8, @function
+inner_store_4x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %xmm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%xmm12, %xmm14, %xmm14
+	vsubps		%xmm15, %xmm12, %xmm15
+	vandps		%xmm14, %xmm15, %xmm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	vmovaps		%xmm3, %xmm2
+	vmovaps		%xmm4, %xmm3
+	vmovaps		%xmm5, %xmm4
+	vmovaps		%xmm6, %xmm5
+	vmovaps		%xmm7, %xmm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	vmovaps		%xmm3, %xmm2
+	vmovaps		%xmm4, %xmm3
+	vmovaps		%xmm5, %xmm4
+	vmovaps		%xmm6, %xmm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	vmovaps		%xmm3, %xmm2
+	vmovaps		%xmm4, %xmm3
+	vmovaps		%xmm5, %xmm4
+	addq		$32, %r11
+
+	cmpl	$3, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	vmovaps		%xmm3, %xmm2
+	vmovaps		%xmm4, %xmm3
+	addq		$32, %r11
+
+	cmpl	$4, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	vmovaps		%xmm3, %xmm2
+	addq		$32, %r11
+
+	cmpl	$5, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	vmovaps		%xmm2, %xmm1
+	addq		$32, %r11
+
+	cmpl	$6, %r15d
+	jle		0f
+
+	vmovaps		%xmm1, %xmm0
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$8, %eax
+	jle		0f
+	movl	$8, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%xmm0, %xmm15,  0(%r11)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm1, %xmm15, 32(%r11)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm2, %xmm15, 64(%r11)
+	cmpl		$4, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm3, %xmm15, 96(%r11)
+	cmpl		$5, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm4, %xmm15, 128(%r11)
+	cmpl		$6, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm5, %xmm15, 160(%r11)
+	cmpl		$7, %r15d
+	jl			7f // end
+	vmaskmovps	%xmm6, %xmm15, 192(%r11)
+	je			7f // end
+	vmaskmovps	%xmm7, %xmm15, 224(%r11)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_gen_lib8, .-inner_store_4x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
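+// Hedged sketch (illustrative names): only the lower-triangular part of the
+// 8x4 block is overwritten; column jj keeps its first jj entries from D:
+//
+//   for(jj=0; jj<4; jj++)
+//       for(ii=jj; ii<8; ii++)
+//           D[ii+8*jj] = acc[ii+8*jj];
+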
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_lib8, @function
+inner_store_l_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_lib8:
+#endif
+#endif
+	
+	vmovaps 	32(%r10), %ymm12
+	vmovaps 	64(%r10), %ymm13
+	vmovaps 	96(%r10), %ymm14
+
+	vblendps	$0x1, %ymm12, %ymm1, %ymm1
+	vblendps	$0x3, %ymm13, %ymm2, %ymm2
+	vblendps	$0x7, %ymm14, %ymm3, %ymm3
+
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps 	%ymm3, 96(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_lib8, .-inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_vs_lib8, @function
+inner_store_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	vmaskmovps	%ymm0, %ymm15,  0(%r10)
+	cmpl		$2, %r12d
+	jl			0f // end
+	vmovaps 	32(%r10), %ymm12
+	vblendps	$0x1, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm15, 32(%r10)
+	cmpl		$3, %r12d
+	jl			0f // end
+	vmovaps 	64(%r10), %ymm12
+	vblendps	$0x3, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm15, 64(%r10)
+	je			0f // end
+	vmovaps 	96(%r10), %ymm12
+	vblendps	$0x7, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm15, 96(%r10)
+	//
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_vs_lib8, .-inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X4_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x4_gen_lib8, @function
+inner_store_l_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm12, %ymm15
+	vandps		%ymm14, %ymm15, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm15,  0(%r11)
+	cmpl		$2, %r15d
+	jl			7f // end
+	vmovaps 	32(%r11), %ymm12
+	vblendps	$0x1, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm15, 32(%r11)
+	cmpl		$3, %r15d
+	jl			7f // end
+	vmovaps 	64(%r11), %ymm12
+	vblendps	$0x3, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm15, 64(%r11)
+	je			7f // end
+	vmovaps 	96(%r11), %ymm12
+	vblendps	$0x7, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm15, 96(%r11)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x4_gen_lib8, .-inner_store_l_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                               rdi    rsi           rdx       rcx       r8           r9        rsp+8
+// void kernel_sgemm_nt_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
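+// Reference semantics as a hedged C sketch (panel-major lib8 storage assumed:
+// 8-row panels, column-major inside each panel; names are illustrative):
+//
+//   // D and C are 8x4 blocks, D[ii+8*jj]; A and B are 8-row panels with
+//   // A[ii+8*ll] = A(ii,ll) and B[jj+8*ll] = B(jj,ll), i.e. B is accessed
+//   // transposed.
+//   for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<8; ii++) {
+//           float acc = 0.0f;
+//           for(ll=0; ll<k; ll++)
+//               acc += A[ii+8*ll]*B[jj+8*ll];
+//           D[ii+8*jj] = alpha[0]*acc + beta[0]*C[ii+8*jj];
+//       }
+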
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x4_lib8
+	.type kernel_sgemm_nt_8x4_lib8, @function
+kernel_sgemm_nt_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x4_lib8
+_kernel_sgemm_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x4_lib8
+	.def kernel_sgemm_nt_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x4_lib8, .-kernel_sgemm_nt_8x4_lib8
+#endif
+
+
+
+
+
+//                               rdi    rsi           rdx       rcx       r8           r9        rsp+8
+// void kernel_sgemm_nt_4x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
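+// Same building blocks as the 8x4 kernel above, but with A and B swapped on
+// entry and the accumulator transposed in the scale step, so that (hedged
+// sketch, illustrative indexing) the stored 4x8 result is roughly
+//
+//   D[ii+8*jj] = alpha[0]*sum_ll( A[ii+8*ll]*B[jj+8*ll] ) + beta[0]*C[ii+8*jj]
+//
+// for ii=0..3, jj=0..7, with A now the 4-row operand.
+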
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_4x8_lib8
+	.type kernel_sgemm_nt_4x8_lib8, @function
+kernel_sgemm_nt_4x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_4x8_lib8
+_kernel_sgemm_nt_4x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_4x8_lib8
+	.def kernel_sgemm_nt_4x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // B
+	movq	ARG3, %r12  // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_4x8_lib8, .-kernel_sgemm_nt_4x8_lib8
+#endif
+
+
+
+
+
+//                               rdi    rsi           rdx       rcx       r8           r9        rsp+8
+// void kernel_sgemm_nt_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x4_vs_lib8
+	.type kernel_sgemm_nt_8x4_vs_lib8, @function
+kernel_sgemm_nt_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x4_vs_lib8
+_kernel_sgemm_nt_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x4_vs_lib8
+	.def kernel_sgemm_nt_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x4_vs_lib8, .-kernel_sgemm_nt_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                  rdi    rsi           rdx       rcx       r8           r9        rsp+8
+// void kernel_sgemm_nt_4x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_4x8_vs_lib8
+	.type kernel_sgemm_nt_4x8_vs_lib8, @function
+kernel_sgemm_nt_4x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_4x8_vs_lib8
+_kernel_sgemm_nt_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_4x8_vs_lib8
+	.def kernel_sgemm_nt_4x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // B
+	movq	ARG3, %r12  // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_4x8_vs_lib8, .-kernel_sgemm_nt_4x8_vs_lib8
+#endif
+
+
+
+
+
+//                                   rdi    rsi           rdx       rcx       r8           r9           rsp+8     rsp+16   rsp+24       rsp+32    rsp+40   rsp+48  rsp+56  rsp+64  rsp+72
+// void kernel_sgemm_nt_8x4_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x4_gen_lib8
+	.type kernel_sgemm_nt_8x4_gen_lib8, @function
+kernel_sgemm_nt_8x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x4_gen_lib8
+_kernel_sgemm_nt_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x4_gen_lib8
+	.def kernel_sgemm_nt_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // offsetC
+	movq	ARG7, %r13 // C
+	movq	ARG8, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG9, %r10 // offsetD
+	movq	ARG10, %r11 // D
+	movq	ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG12, %r13 // m0
+	movq	ARG13, %r14 // m1
+	movq	ARG14, %r15 // n0
+	movq	ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x4_gen_lib8, .-kernel_sgemm_nt_8x4_gen_lib8
+#endif
+
+
+
+
+
+//                                   rdi    rsi           rdx       rcx       r8           r9           rsp+8     rsp+16   rsp+24       rsp+32    rsp+40   rsp+48  rsp+56  rsp+64  rsp+72
+// void kernel_sgemm_nt_4x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_4x8_gen_lib8
+	.type kernel_sgemm_nt_4x8_gen_lib8, @function
+kernel_sgemm_nt_4x8_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_4x8_gen_lib8
+_kernel_sgemm_nt_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_4x8_gen_lib8
+	.def kernel_sgemm_nt_4x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11  // B
+	movq	ARG3, %r12  // A
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // offsetC
+	movq	ARG7, %r13 // C
+	movq	ARG8, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_AB_4X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_ab_4x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_ab_4x8_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG9, %r10 // offsetD
+	movq	ARG10, %r11 // D
+	movq	ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG12, %r13 // m0
+	movq	ARG13, %r14 // m1
+	movq	ARG14, %r15 // n0
+	movq	ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_4x8_gen_lib8, .-kernel_sgemm_nt_4x8_gen_lib8
+#endif
+
+
+
+
+
+//                               0      1             2         3            4         5        6            7         8
+// void kernel_sgemm_nn_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
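+//
+// nn variant: B is not transposed, so it is walked across panels; offsetB and the
+// panel stride sdb (scaled to 8*sdb*sizeof(float) bytes below) let the edge routine
+// consume a possibly unaligned head of B before the unrolled main loop.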
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x4_lib8
+	.type kernel_sgemm_nn_8x4_lib8, @function
+kernel_sgemm_nn_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x4_lib8
+_kernel_sgemm_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x4_lib8
+	.def kernel_sgemm_nn_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x4_lib8, .-kernel_sgemm_nn_8x4_lib8
+#endif
+
+
+
+
+
+//                               1      2             3         4            5         6        7            8         9         10      11
+// void kernel_sgemm_nn_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x4_vs_lib8
+	.type kernel_sgemm_nn_8x4_vs_lib8, @function
+kernel_sgemm_nn_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x4_vs_lib8
+_kernel_sgemm_nn_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x4_vs_lib8
+	.def kernel_sgemm_nn_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // km
+	movq	ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x4_vs_lib8, .-kernel_sgemm_nn_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                   rdi    rsi           rdx       rcx       r8        r9       rsp+8        rsp+16    rsp+24    rsp+32    rsp+40   rsp+48     rsp+56   rsp+64  rsp+72  rsp+80  rsp+88
+// void kernel_sgemm_nn_8x4_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x4_gen_lib8
+	.type kernel_sgemm_nn_8x4_gen_lib8, @function
+kernel_sgemm_nn_8x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x4_gen_lib8
+_kernel_sgemm_nn_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x4_gen_lib8
+	.def kernel_sgemm_nn_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // offsetC
+	movq	ARG9, %r13 // C
+	movq	ARG10, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG11, %r10 // offsetD
+	movq	ARG12, %r11 // D
+	movq	ARG13, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG14, %r13 // m0
+	movq	ARG15, %r14 // m1
+	movq	ARG16, %r15 // n0
+	movq	ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x4_gen_lib8, .-kernel_sgemm_nn_8x4_gen_lib8
+#endif
+
+
+
+
+
+//                                 1      2             3         4         5            6         7
+// void kernel_ssyrk_nt_l_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
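+//
+// syrk tile: same nt accumulation and alpha/beta scaling as the 8x4 gemm kernel,
+// but only the lower triangle of the 8x4 block of D is written back
+// (inner_store_l_8x4_lib8).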
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_8x4_lib8
+	.type kernel_ssyrk_nt_l_8x4_lib8, @function
+kernel_ssyrk_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_8x4_lib8
+_kernel_ssyrk_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_8x4_lib8
+	.def kernel_ssyrk_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_8x4_lib8, .-kernel_ssyrk_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+//                                    1      2             3         4         5            6         7         8       9
+// void kernel_ssyrk_nt_l_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_8x4_vs_lib8
+	.type kernel_ssyrk_nt_l_8x4_vs_lib8, @function
+kernel_ssyrk_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_8x4_vs_lib8
+_kernel_ssyrk_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_8x4_vs_lib8
+	.def kernel_ssyrk_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_8x4_vs_lib8, .-kernel_ssyrk_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                      edi    rsi       rdx       ecx       r8        r9        rsp+8     
+// void kernel_strsm_nt_rl_inv_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
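+//
+// trsm tile: accumulates C - A*B^T, then solves D * E^T = C - A*B^T with E lower
+// triangular ("rlt" in the inner edge routine name); the reciprocals of E's
+// diagonal are supplied in inv_diag_E so the solve uses multiplies, not divides.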
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_8x4_lib8
+	.type kernel_strsm_nt_rl_inv_8x4_lib8, @function
+kernel_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_8x4_lib8
+_kernel_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_8x4_lib8
+	.def kernel_strsm_nt_rl_inv_8x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_8x4_lib8, .-kernel_strsm_nt_rl_inv_8x4_lib8
+#endif
+
+
+
+
+
+//                                      edi    rsi       rdx       ecx       r8        r9        rsp+8     
+// void kernel_strsm_nt_rl_inv_4x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_4x8_lib8
+	.type kernel_strsm_nt_rl_inv_4x8_lib8, @function
+kernel_strsm_nt_rl_inv_4x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_4x8_lib8
+_kernel_strsm_nt_rl_inv_4x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_4x8_lib8
+	.def kernel_strsm_nt_rl_inv_4x8_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_4x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG3, %r11
+	movq	ARG2, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_11_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_11_4x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_11_4x8_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+	movq	$8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_4x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_4x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_4x8_lib8, .-kernel_strsm_nt_rl_inv_4x8_lib8
+#endif
+
+
+
+
+
+//                                         edi    rsi       rdx       ecx       r8        r9        rsp+8               rsp+16  rsp+24  
+// void kernel_strsm_nt_rl_inv_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_8x4_vs_lib8
+	.type kernel_strsm_nt_rl_inv_8x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_8x4_vs_lib8
+_kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_8x4_vs_lib8
+	.def kernel_strsm_nt_rl_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn // TODO scale gen
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_8x4_vs_lib8, .-kernel_strsm_nt_rl_inv_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                         edi    rsi       rdx       ecx       r8        r9        rsp+8               rsp+16  rsp+24  
+// void kernel_strsm_nt_rl_inv_4x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_4x8_vs_lib8
+	.type kernel_strsm_nt_rl_inv_4x8_vs_lib8, @function
+kernel_strsm_nt_rl_inv_4x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_4x8_vs_lib8
+_kernel_strsm_nt_rl_inv_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_4x8_vs_lib8
+	.def kernel_strsm_nt_rl_inv_4x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_4x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG3, %r11
+	movq	ARG2, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn // TODO scale gen
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_TRAN_SCALE_11_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_tran_scale_11_4x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_tran_scale_11_4x8_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_4x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_4x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_4x8_vs_lib8, .-kernel_strsm_nt_rl_inv_4x8_vs_lib8
+#endif
+
+
+
+
+
+//                                            1       2          3          4       5          6          7         8         9         10
+// void kernel_sgemm_strsm_nt_rl_inv_8x4_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
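+//
+// fused gemm+trsm tile: adds Ap*Bp^T over kp, subtracts Am*Bm^T over km, adds C,
+// then applies the same right-lower-transposed solve as
+// kernel_strsm_nt_rl_inv_8x4_lib8 above.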
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_8x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10   // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_8x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+#endif
+
+
+
+
+
+//                                               1       2          3          4       5          6          7         8         9         10                 11      12
+// void kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10  // C 
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D 
+	movq	ARG11, %r11 // km 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                  1      2         3         4         5         6
+// void kernel_spotrf_nt_l_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
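+//
+// Cholesky tile: computes C - A*B^T and factorizes the resulting 8x4 lower block in
+// registers (inner_edge_potrf_8x4_lib8); inv_diag_D receives the reciprocals of the
+// factor's diagonal for reuse by the companion trsm kernels.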
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_8x4_lib8
+	.type kernel_spotrf_nt_l_8x4_lib8, @function
+kernel_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_8x4_lib8
+_kernel_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_8x4_lib8
+	.def kernel_spotrf_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x4_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_8x4_lib8, .-kernel_spotrf_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+//                                     edi    rsi       rdx       rcx       r8        r9                  rsp+8   rsp+16
+// void kernel_spotrf_nt_l_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_8x4_vs_lib8
+	.type kernel_spotrf_nt_l_8x4_vs_lib8, @function
+kernel_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_8x4_vs_lib8
+_kernel_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_8x4_vs_lib8
+	.def kernel_spotrf_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movq	ARG8, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG7, %r11 // m1 
+	movq	ARG8, %r12 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_8x4_vs_lib8, .-kernel_spotrf_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                        1       2          3          4       5          6          7         8         9
+// void kernel_ssyrk_spotrf_nt_l_8x4_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
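+//
+// fused syrk+potrf tile: Ap*Bp^T added over kp, Am*Bm^T subtracted over km, plus C,
+// followed by the Cholesky edge factorization; this non-vs version reuses the _vs
+// edge routine with kn hard-coded to 4 (movl $4, %r11d below).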
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_8x4_lib8
+	.type kernel_ssyrk_spotrf_nt_l_8x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_8x4_lib8
+_kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_8x4_lib8
+	.def kernel_ssyrk_spotrf_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$4, %r11d // kn = 4: factorize the full 4-column block
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_8x4_lib8, .-kernel_ssyrk_spotrf_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+//                                           1       2          3          4       5          6          7         8         9                  10      11
+// void kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+	.type kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+	.def kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+	movq	ARG10, %r11 // km 
+	movq	ARG11, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                  1      2             3         4            5         6        7
+// void kernel_strmm_nn_rl_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D);
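+//
+// trmm tile: D <- alpha * A * B with B lower triangular on the right (not
+// transposed); an edge routine handles the initial triangle of B, the regular nn
+// inner kernel handles the remaining rectangular part, and there is no C/beta term
+// (the accumulator is only scaled by alpha via inner_scale_a0_8x4_lib8).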
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_8x4_lib8
+	.type kernel_strmm_nn_rl_8x4_lib8, @function
+kernel_strmm_nn_rl_8x4_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_8x4_lib8
+_kernel_strmm_nn_rl_8x4_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_8x4_lib8
+	.def kernel_strmm_nn_rl_8x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG5, %r12 // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after the initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_8x4_lib8, .-kernel_strmm_nn_rl_8x4_lib8
+#endif
+
+
+
+
+
+//                                     1      2             3         4            5         6        7         8       9
+// void kernel_strmm_nn_rl_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_8x4_vs_lib8
+	.type kernel_strmm_nn_rl_8x4_vs_lib8, @function
+kernel_strmm_nn_rl_8x4_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_8x4_vs_lib8
+_kernel_strmm_nn_rl_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_8x4_vs_lib8
+	.def kernel_strmm_nn_rl_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG5, %r12 // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after the initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_8x4_vs_lib8, .-kernel_strmm_nn_rl_8x4_vs_lib8
+#endif
+
+
+
+
+
+//                                      1      2             3         4            5         6        7            8         9        10      11      12      13
+// void kernel_strmm_nn_rl_8x4_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strmm_nn_rl_8x4_gen_lib8
+	.type kernel_strmm_nn_rl_8x4_gen_lib8, @function
+kernel_strmm_nn_rl_8x4_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strmm_nn_rl_8x4_gen_lib8
+_kernel_strmm_nn_rl_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strmm_nn_rl_8x4_gen_lib8
+	.def kernel_strmm_nn_rl_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG5, %r12 // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after the initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // offsetD
+	movq	ARG8, %r11 // D
+	movq	ARG9, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG10, %r13 // m0
+	movq	ARG11, %r14 // m1
+	movq	ARG12, %r15 // n0
+	movq	ARG13, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strmm_nn_rl_8x4_gen_lib8, .-kernel_strmm_nn_rl_8x4_gen_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
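+	// Summary: LC00..LC02 hold the ascending lane indices {0.5 .. 23.5} used by the
+	// masked (_vs/_gen) store helpers earlier in this file to build per-lane masks,
+	// while LC03 and LC04 hold all-ones and mixed +/-1.0 float patterns.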
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC04: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_sgemm_8x8_lib8.S b/kernel/avx2/kernel_sgemm_8x8_lib8.S
new file mode 100644
index 0000000..094acda
--- /dev/null
+++ b/kernel/avx2/kernel_sgemm_8x8_lib8.S
@@ -0,0 +1,5395 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp); \
+	vzeroupper;
+#define EPILOGUE \
+	vzeroupper; \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
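+// Note on the Windows branch above: the Microsoft x64 ABI passes the first four
+// integer arguments in rcx/rdx/r8/r9 with 32 bytes of shadow space (hence the +40
+// stack offsets) and treats rdi, rsi and xmm6-xmm15 as callee-saved, which is why
+// the Windows prologue/epilogue also spills those registers.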
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1  <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2  <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3  <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+8*k*sizeof(float)
+// ymm0  <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1  <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2  <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3  <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nt_8x8_lib8, @function
+inner_kernel_gemm_add_nt_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nt_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_8x8_lib8:
+#endif
+#endif
+	
+	
+// broadcast scheme
+#if 1
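+// Note: the constant condition above statically selects the broadcast/FMA variant of
+// this inner kernel; the shuffle-based variant in the #else branch below ("shuffle
+// scheme") is compiled out (flip the condition to select it instead).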
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+
+	cmpl	$3, %r10d
+	jle		4f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmovaps			0(%r11), %ymm13 // A
+	vbroadcastss	0(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vbroadcastss	4(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vbroadcastss	8(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vbroadcastss	12(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vbroadcastss	16(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm4
+	vbroadcastss	20(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm5
+	vbroadcastss	24(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm6
+	vbroadcastss	28(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm7
+	subl	$4, %r10d
+
+	// unroll 1
+	vmovaps			32(%r11), %ymm13 // A
+	vbroadcastss	32(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vbroadcastss	36(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vbroadcastss	40(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vbroadcastss	44(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vbroadcastss	48(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm4
+	vbroadcastss	52(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm5
+	vbroadcastss	56(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm6
+	vbroadcastss	60(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 2
+	vmovaps			-64(%r11), %ymm13 // A
+	vbroadcastss	64(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vbroadcastss	68(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vbroadcastss	72(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vbroadcastss	76(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vbroadcastss	80(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm4
+	vbroadcastss	84(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm5
+	vbroadcastss	88(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm6
+	vbroadcastss	92(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm7
+	addq	$128, %r12
+
+	// unroll 3
+	vmovaps			-32(%r11), %ymm13 // A
+	vbroadcastss	-32(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vbroadcastss	-28(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	vbroadcastss	-24(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	vbroadcastss	-20(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vbroadcastss	-16(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm4
+	vbroadcastss	-12(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm5
+	vbroadcastss	-8(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm6
+	vbroadcastss	-4(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm7
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovaps			0(%r11), %ymm13 // a
+	vbroadcastss	0(%r12), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm0
+	vbroadcastss	4(%r12), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm1
+	subl	$1, %r10d
+	vbroadcastss	8(%r12), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm2
+	addq	$32, %r11
+	vbroadcastss	12(%r12), %ymm12 // b
+	vfmadd231ps		%ymm13, %ymm12, %ymm3
+	vbroadcastss	16(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm4
+	vbroadcastss	20(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm5
+	vbroadcastss	24(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm6
+	vbroadcastss	28(%r12), %ymm12 // B
+	vfmadd231ps		%ymm13, %ymm12, %ymm7
+	addq	$32, %r12
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+// shuffle scheme
+#else
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vbroadcastf128	16(%r12), %ymm15 // B
+	vmovaps			32(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vfmadd231ps		%ymm12, %ymm14, %ymm0
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm1
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm2
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm3
+	vbroadcastf128	32(%r12), %ymm14 // B
+
+	vfmadd231ps		%ymm12, %ymm15, %ymm4
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm12, %ymm15, %ymm5
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm12, %ymm15, %ymm6
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm12, %ymm15, %ymm7
+	vbroadcastf128	48(%r12), %ymm15 // B
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vfmadd231ps		%ymm13, %ymm14, %ymm0
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm13, %ymm14, %ymm1
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm13, %ymm14, %ymm2
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm13, %ymm14, %ymm3
+	vbroadcastf128	64(%r12), %ymm14 // B
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+	vbroadcastf128	80(%r12), %ymm15 // B
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vfmadd231ps		%ymm12, %ymm14, %ymm0
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm1
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm2
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm3
+	vbroadcastf128	96(%r12), %ymm14 // B
+
+	vfmadd231ps		%ymm12, %ymm15, %ymm4
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm12, %ymm15, %ymm5
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm12, %ymm15, %ymm6
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm12, %ymm15, %ymm7
+	vbroadcastf128	112(%r12), %ymm15 // B
+	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vfmadd231ps		%ymm13, %ymm14, %ymm0
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm13, %ymm14, %ymm1
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm13, %ymm14, %ymm2
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm13, %ymm14, %ymm3
+	vbroadcastf128	0(%r12), %ymm14 // B
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+	vbroadcastf128	16(%r12), %ymm15 // B
+	vmovaps			32(%r11), %ymm13 // A
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vfmadd231ps		%ymm12, %ymm14, %ymm0
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm1
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm2
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm3
+	vbroadcastf128	32(%r12), %ymm14 // B
+
+	vfmadd231ps		%ymm12, %ymm15, %ymm4
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm12, %ymm15, %ymm5
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm12, %ymm15, %ymm6
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm12, %ymm15, %ymm7
+	vbroadcastf128	48(%r12), %ymm15 // B
+	vmovaps			64(%r11), %ymm12 // A
+
+
+	// unroll 1
+	vfmadd231ps		%ymm13, %ymm14, %ymm0
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm13, %ymm14, %ymm1
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm13, %ymm14, %ymm2
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm13, %ymm14, %ymm3
+	vbroadcastf128	64(%r12), %ymm14 // B
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+	vbroadcastf128	80(%r12), %ymm15 // B
+	vmovaps			96(%r11), %ymm13 // A
+
+
+	// unroll 2
+	vfmadd231ps		%ymm12, %ymm14, %ymm0
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm1
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm2
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm12, %ymm14, %ymm3
+	vbroadcastf128	96(%r12), %ymm14 // B
+
+	vfmadd231ps		%ymm12, %ymm15, %ymm4
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm12, %ymm15, %ymm5
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm12, %ymm15, %ymm6
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm12, %ymm15, %ymm7
+	vbroadcastf128	112(%r12), %ymm15 // B
+//	vmovaps			128(%r11), %ymm12 // A
+
+	subl	$4, %r10d
+	addq	$128, %r11
+	addq	$128, %r12
+
+	// unroll 3
+	vfmadd231ps		%ymm13, %ymm14, %ymm0
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm13, %ymm14, %ymm1
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm13, %ymm14, %ymm2
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+
+	vfmadd231ps		%ymm13, %ymm14, %ymm3
+//	vbroadcastf128	0(%r12), %ymm14 // B
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm4
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm5
+	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm6
+	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
+
+	vfmadd231ps		%ymm13, %ymm15, %ymm7
+//	vbroadcastf128	16(%r12), %ymm15 // B
+//	vmovaps			32(%r11), %ymm13 // A
+
+
+//	cmpl	$4, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vbroadcastf128	0(%r12), %ymm14 // B
+	vmovaps			0(%r11), %ymm12 // A
+	vfmadd231ps		%ymm12, %ymm14, %ymm0
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vfmadd231ps		%ymm12, %ymm14, %ymm1
+
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vfmadd231ps		%ymm12, %ymm14, %ymm2
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vfmadd231ps		%ymm12, %ymm14, %ymm3
+
+	vbroadcastf128	16(%r12), %ymm14 // B
+	vfmadd231ps		%ymm12, %ymm14, %ymm4
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vfmadd231ps		%ymm12, %ymm14, %ymm5
+
+	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
+	vfmadd231ps		%ymm12, %ymm14, %ymm6
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$32, %r12
+
+	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
+	vfmadd231ps		%ymm12, %ymm14, %ymm7
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nt_8x8_lib8, .-inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1  <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2  <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3  <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+4*k*sizeof(double)
+// ymm0  <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1  <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2  <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3  <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4  <- []
+// ymm5  <- []
+// ymm6  <- []
+// ymm7  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
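+// For reference, a scalar C sketch of the update computed by this routine
+// (illustrative only, not used by the kernel; names acc/A/B/k are assumed,
+// bs=8 panel-major storage as documented above):
+//
+//   // acc(8x8) -= A(8xk) * B(8xk)^T, with acc held in ymm0..ymm7
+//   for(ll=0; ll<k; ll++)
+//     for(jj=0; jj<8; jj++)
+//       for(ii=0; ii<8; ii++)
+//         acc[ii+8*jj] -= A[ii+8*ll] * B[jj+8*ll];
+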
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_sub_nt_8x8_lib8, @function
+inner_kernel_gemm_sub_nt_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_sub_nt_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_8x8_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// preload
+
+	cmpl	$3, %r10d
+	jle		4f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmovaps			0(%r11), %ymm13 // A
+	vbroadcastss	0(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vbroadcastss	4(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vbroadcastss	8(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vbroadcastss	12(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vbroadcastss	16(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm4
+	vbroadcastss	20(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm5
+	vbroadcastss	24(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm6
+	vbroadcastss	28(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm7
+	subl	$4, %r10d
+
+	// unroll 0
+	vmovaps			32(%r11), %ymm13 // A
+	vbroadcastss	32(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vbroadcastss	36(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vbroadcastss	40(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vbroadcastss	44(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vbroadcastss	48(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm4
+	vbroadcastss	52(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm5
+	vbroadcastss	56(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm6
+	vbroadcastss	60(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm7
+	addq	$128, %r11
+
+	// unroll 0
+	vmovaps			-64(%r11), %ymm13 // A
+	vbroadcastss	64(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vbroadcastss	68(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vbroadcastss	72(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vbroadcastss	76(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vbroadcastss	80(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm4
+	vbroadcastss	84(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm5
+	vbroadcastss	88(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm6
+	vbroadcastss	92(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm7
+	addq	$128, %r12
+
+	// unroll 0
+	vmovaps			-32(%r11), %ymm13 // A
+	vbroadcastss	-32(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vbroadcastss	-28(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	vbroadcastss	-24(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	vbroadcastss	-20(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vbroadcastss	-16(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm4
+	vbroadcastss	-12(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm5
+	vbroadcastss	-8(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm6
+	vbroadcastss	-4(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm7
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovaps			0(%r11), %ymm13 // a
+	vbroadcastss	0(%r12), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm0
+	vbroadcastss	4(%r12), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm1
+	subl	$1, %r10d
+	vbroadcastss	8(%r12), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm2
+	addq	$32, %r11
+	vbroadcastss	12(%r12), %ymm12 // b
+	vfnmadd231ps	%ymm13, %ymm12, %ymm3
+	vbroadcastss	16(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm4
+	vbroadcastss	20(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm5
+	vbroadcastss	24(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm6
+	vbroadcastss	28(%r12), %ymm12 // B
+	vfnmadd231ps	%ymm13, %ymm12, %ymm7
+	addq	$32, %r12
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_sub_nt_8x8_lib8, .-inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13   <- 4*sdb*sizeof(double)
+// r14   <- dirty
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
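+// For reference, a scalar C sketch of the accumulation (illustrative only,
+// not used by the kernel; names acc/A/B/k/sdb are assumed): A is an 8 x k
+// panel, B is k x 8 stored in 8-row panels with column stride 8 inside a
+// panel and panel stride 8*sdb floats:
+//
+//   for(ll=0; ll<k; ll++)
+//     for(jj=0; jj<8; jj++)
+//       for(ii=0; ii<8; ii++)
+//         acc[ii+8*jj] += A[ii+8*ll] * B[(ll/8)*8*sdb + (ll%8) + 8*jj];
+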
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_gemm_add_nn_8x8_lib8, @function
+inner_kernel_gemm_add_nn_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_8x8_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	cmpl	$8, %r10d
+	jl		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+//	prefetcht0	0(%r12, %r13, 1) // software prefetch
+//	prefetcht0	64(%r12, %r13, 1) // software prefetch
+//	prefetcht0	128(%r12, %r13, 1) // software prefetch
+//	prefetcht0	192(%r12, %r13, 1) // software prefetch
+
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm3
+	vbroadcastss	128(%r12), %ymm13 // B[4]
+	vfmadd231ps		%ymm12, %ymm13, %ymm4
+	vbroadcastss	160(%r12), %ymm13 // B[5]
+	vfmadd231ps		%ymm12, %ymm13, %ymm5
+	vbroadcastss	192(%r12), %ymm13 // B[6]
+	vfmadd231ps		%ymm12, %ymm13, %ymm6
+	vbroadcastss	224(%r12), %ymm13 // B[7]
+	vfmadd231ps		%ymm12, %ymm13, %ymm7
+
+	// unroll 1
+	vmovaps			32(%r11), %ymm12 // A[0]
+	vbroadcastss	4(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm0
+	vbroadcastss	36(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm1
+	vbroadcastss	68(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm2
+	vbroadcastss	100(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm3
+	vbroadcastss	132(%r12), %ymm13 // B[4]
+	vfmadd231ps		%ymm12, %ymm13, %ymm4
+	vbroadcastss	164(%r12), %ymm13 // B[5]
+	vfmadd231ps		%ymm12, %ymm13, %ymm5
+	vbroadcastss	196(%r12), %ymm13 // B[6]
+	vfmadd231ps		%ymm12, %ymm13, %ymm6
+	vbroadcastss	228(%r12), %ymm13 // B[7]
+	vfmadd231ps		%ymm12, %ymm13, %ymm7
+
+	// unroll 2
+	vmovaps			64(%r11), %ymm12 // A[0]
+	vbroadcastss	8(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm0
+	vbroadcastss	40(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm1
+	vbroadcastss	72(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm2
+	vbroadcastss	104(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm3
+	vbroadcastss	136(%r12), %ymm13 // B[4]
+	vfmadd231ps		%ymm12, %ymm13, %ymm4
+	vbroadcastss	168(%r12), %ymm13 // B[5]
+	vfmadd231ps		%ymm12, %ymm13, %ymm5
+	vbroadcastss	200(%r12), %ymm13 // B[6]
+	vfmadd231ps		%ymm12, %ymm13, %ymm6
+	vbroadcastss	232(%r12), %ymm13 // B[7]
+	vfmadd231ps		%ymm12, %ymm13, %ymm7
+
+	// unroll 3
+	vmovaps			96(%r11), %ymm12 // A[0]
+	vbroadcastss	12(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm0
+	vbroadcastss	44(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm1
+	vbroadcastss	76(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm2
+	vbroadcastss	108(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm3
+	vbroadcastss	140(%r12), %ymm13 // B[4]
+	vfmadd231ps		%ymm12, %ymm13, %ymm4
+	vbroadcastss	172(%r12), %ymm13 // B[5]
+	vfmadd231ps		%ymm12, %ymm13, %ymm5
+	vbroadcastss	204(%r12), %ymm13 // B[6]
+	vfmadd231ps		%ymm12, %ymm13, %ymm6
+	vbroadcastss	236(%r12), %ymm13 // B[7]
+	vfmadd231ps		%ymm12, %ymm13, %ymm7
+
+	// unroll 4
+	vmovaps			128(%r11), %ymm12 // A[0]
+	vbroadcastss	16(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm0
+	vbroadcastss	48(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm1
+	vbroadcastss	80(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm2
+	vbroadcastss	112(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm3
+	vbroadcastss	144(%r12), %ymm13 // B[4]
+	vfmadd231ps		%ymm12, %ymm13, %ymm4
+	vbroadcastss	176(%r12), %ymm13 // B[5]
+	vfmadd231ps		%ymm12, %ymm13, %ymm5
+	vbroadcastss	208(%r12), %ymm13 // B[6]
+	vfmadd231ps		%ymm12, %ymm13, %ymm6
+	vbroadcastss	240(%r12), %ymm13 // B[7]
+	vfmadd231ps		%ymm12, %ymm13, %ymm7
+
+	// unroll 5
+	vmovaps			160(%r11), %ymm12 // A[0]
+	vbroadcastss	20(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm0
+	vbroadcastss	52(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm1
+	vbroadcastss	84(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm2
+	vbroadcastss	116(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm3
+	vbroadcastss	148(%r12), %ymm13 // B[4]
+	vfmadd231ps		%ymm12, %ymm13, %ymm4
+	vbroadcastss	180(%r12), %ymm13 // B[5]
+	vfmadd231ps		%ymm12, %ymm13, %ymm5
+	vbroadcastss	212(%r12), %ymm13 // B[6]
+	vfmadd231ps		%ymm12, %ymm13, %ymm6
+	vbroadcastss	244(%r12), %ymm13 // B[7]
+	vfmadd231ps		%ymm12, %ymm13, %ymm7
+	subl	$8, %r10d
+
+	// unroll 6
+	vmovaps			192(%r11), %ymm12 // A[0]
+	vbroadcastss	24(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm0
+	vbroadcastss	56(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm1
+	vbroadcastss	88(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm2
+	vbroadcastss	120(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm3
+	vbroadcastss	152(%r12), %ymm13 // B[4]
+	vfmadd231ps		%ymm12, %ymm13, %ymm4
+	vbroadcastss	184(%r12), %ymm13 // B[5]
+	vfmadd231ps		%ymm12, %ymm13, %ymm5
+	vbroadcastss	216(%r12), %ymm13 // B[6]
+	vfmadd231ps		%ymm12, %ymm13, %ymm6
+	vbroadcastss	248(%r12), %ymm13 // B[7]
+	vfmadd231ps		%ymm12, %ymm13, %ymm7
+	addq	$256, %r11
+
+	// unroll 7
+	vmovaps			-32(%r11), %ymm12 // A[0]
+	vbroadcastss	28(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm0
+	vbroadcastss	60(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm1
+	vbroadcastss	92(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm2
+	vbroadcastss	124(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm3
+	vbroadcastss	156(%r12), %ymm13 // B[4]
+	vfmadd231ps		%ymm12, %ymm13, %ymm4
+	vbroadcastss	188(%r12), %ymm13 // B[5]
+	vfmadd231ps		%ymm12, %ymm13, %ymm5
+	vbroadcastss	220(%r12), %ymm13 // B[6]
+	vfmadd231ps		%ymm12, %ymm13, %ymm6
+	vbroadcastss	252(%r12), %ymm13 // B[7]
+	addq	%r13, %r12
+	vfmadd231ps		%ymm12, %ymm13, %ymm7
+
+	cmpl	$7, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean1-up loop
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+3: // clean1-up loop
+	
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm3
+	vbroadcastss	128(%r12), %ymm13 // B[4]
+	vfmadd231ps		%ymm12, %ymm13, %ymm4
+	vbroadcastss	160(%r12), %ymm13 // B[5]
+	vfmadd231ps		%ymm12, %ymm13, %ymm5
+	vbroadcastss	192(%r12), %ymm13 // B[6]
+	vfmadd231ps		%ymm12, %ymm13, %ymm6
+	vbroadcastss	224(%r12), %ymm13 // B[7]
+	vfmadd231ps		%ymm12, %ymm13, %ymm7
+
+	subl	$1, %r10d
+	addq	$32, %r11
+	addq	$4, %r12
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_gemm_add_nn_8x8_lib8, .-inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(float)
+// r14   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(8-offB)
+// r11   <- A+(8-offB)*bs*sizeof(float)
+// r12   <- B-offB+bs*sdb*sizeof(float)
+// r13   <- bs*sdb*sizeof(float)
+// r14   <- offB
+// ymm0  <- []
+// ymm1  <- []
+// ymm2  <- []
+// ymm3  <- []
+// ymm8  <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
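+// For reference, a scalar C sketch of the intent (illustrative only, not
+// used by the kernel; kend is an assumed name): peel off the first
+// kend = min(k, 8-offB) iterations so that B is left at the start of its
+// next 8-wide panel before the main nn kernel takes over:
+//
+//   kend = k < 8-offB ? k : 8-offB;
+//   B += offB;                              // skip into the current panel
+//   for(ll=0; ll<kend; ll++, A+=8, B+=1)
+//     for(jj=0; jj<8; jj++)
+//       for(ii=0; ii<8; ii++)
+//         acc[ii+8*jj] += A[ii] * B[8*jj];
+//   if(k > kend) B += 8*sdb - 8;            // jump to the next B panel
+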
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_gemm_add_nn_8x8_lib8, @function
+inner_edge_gemm_add_nn_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_8x8_lib8:
+#endif
+#endif
+	
+	cmpl			$0, %r14d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$8, %ebx
+	subl			%r14d, %ebx // 8-offsetB
+	cmpl			%r10d, %ebx
+//	jle				0f
+//	movl			%r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+	cmovgl			%r10d, %ebx // kend=min(k,8-offsetB)
+
+	movl			%r14d, %eax
+	sall			$2, %eax // offsetB*sizeof(float)
+	addq			%rax, %r12 // B+offsetB*sizeof(float)
+
+1:
+	// unroll 0
+	vmovaps			0(%r11), %ymm12 // A[0]
+	vbroadcastss	0(%r12), %ymm13 // B[0]
+	vfmadd231ps		%ymm12, %ymm13, %ymm0
+	vbroadcastss	32(%r12), %ymm13 // B[1]
+	vfmadd231ps		%ymm12, %ymm13, %ymm1
+	vbroadcastss	64(%r12), %ymm13 // B[2]
+	vfmadd231ps		%ymm12, %ymm13, %ymm2
+	vbroadcastss	96(%r12), %ymm13 // B[3]
+	vfmadd231ps		%ymm12, %ymm13, %ymm3
+	vbroadcastss	128(%r12), %ymm13 // B[4]
+	vfmadd231ps		%ymm12, %ymm13, %ymm4
+	vbroadcastss	160(%r12), %ymm13 // B[5]
+	vfmadd231ps		%ymm12, %ymm13, %ymm5
+	vbroadcastss	192(%r12), %ymm13 // B[6]
+	vfmadd231ps		%ymm12, %ymm13, %ymm6
+	vbroadcastss	224(%r12), %ymm13 // B[7]
+	vfmadd231ps		%ymm12, %ymm13, %ymm7
+
+	subl			$1, %r10d // k-1
+	subl			$1, %ebx // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(float)
+	addq			$4, %r12 // B+1*sizeof(float)
+
+	cmpl			$0, %ebx
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r13, %r12
+	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_gemm_add_nn_8x8_lib8, .-inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
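+// For reference, a scalar C sketch of the solve (illustrative only, not used
+// by the kernel; X and L are assumed names): X <- X * L^{-T} in place, where
+// X is the 8 x kn block held in ymm0..ymm7, L is the lower-triangular factor
+// stored at r10 (called D above, element L[kk][jj] at panel index kk+8*jj)
+// and inv_diag_D[jj] = 1.0/L[jj][jj]:
+//
+//   for(jj=0; jj<kn; jj++) {
+//     for(ii=0; ii<8; ii++)
+//       X[ii+8*jj] *= inv_diag_D[jj];
+//     for(kk=jj+1; kk<kn; kk++)
+//       for(ii=0; ii<8; ii++)
+//         X[ii+8*kk] -= X[ii+8*jj] * L[kk+8*jj];
+//   }
+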
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_trsm_rlt_inv_8x8_vs_lib8, @function
+inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_trsm_rlt_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#endif
+#endif
+
+	vbroadcastss	0(%r11), %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vbroadcastss	4(%r10), %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm1
+	vbroadcastss	8(%r10), %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm2
+	vbroadcastss	12(%r10), %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm3
+	vbroadcastss	16(%r10), %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm4
+	vbroadcastss	20(%r10), %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm5
+	vbroadcastss	24(%r10), %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm6
+	vbroadcastss	28(%r10), %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm7
+
+	vbroadcastss	4(%r11), %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vbroadcastss	40(%r10), %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm2
+	vbroadcastss	44(%r10), %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm3
+	vbroadcastss	48(%r10), %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm4
+	vbroadcastss	52(%r10), %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm5
+	vbroadcastss	56(%r10), %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm6
+	vbroadcastss	60(%r10), %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm7
+
+	vbroadcastss	8(%r11), %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vbroadcastss	76(%r10), %ymm13
+	vfnmadd231ps	%ymm2, %ymm13, %ymm3
+	vbroadcastss	80(%r10), %ymm13
+	vfnmadd231ps	%ymm2, %ymm13, %ymm4
+	vbroadcastss	84(%r10), %ymm13
+	vfnmadd231ps	%ymm2, %ymm13, %ymm5
+	vbroadcastss	88(%r10), %ymm13
+	vfnmadd231ps	%ymm2, %ymm13, %ymm6
+	vbroadcastss	92(%r10), %ymm13
+	vfnmadd231ps	%ymm2, %ymm13, %ymm7
+
+	vbroadcastss	12(%r11), %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+	vbroadcastss	112(%r10), %ymm13
+	vfnmadd231ps	%ymm3, %ymm13, %ymm4
+	vbroadcastss	116(%r10), %ymm13
+	vfnmadd231ps	%ymm3, %ymm13, %ymm5
+	vbroadcastss	120(%r10), %ymm13
+	vfnmadd231ps	%ymm3, %ymm13, %ymm6
+	vbroadcastss	124(%r10), %ymm13
+	vfnmadd231ps	%ymm3, %ymm13, %ymm7
+
+	vbroadcastss	16(%r11), %ymm13
+	vmulps			%ymm4, %ymm13, %ymm4
+	cmpl			$6, %r12d
+	jl				0f // ret
+	vbroadcastss	148(%r10), %ymm13
+	vfnmadd231ps	%ymm4, %ymm13, %ymm5
+	vbroadcastss	152(%r10), %ymm13
+	vfnmadd231ps	%ymm4, %ymm13, %ymm6
+	vbroadcastss	156(%r10), %ymm13
+	vfnmadd231ps	%ymm4, %ymm13, %ymm7
+
+	vbroadcastss	20(%r11), %ymm13
+	vmulps			%ymm5, %ymm13, %ymm5
+	cmpl			$7, %r12d
+	jl				0f // ret
+	vbroadcastss	184(%r10), %ymm13
+	vfnmadd231ps	%ymm5, %ymm13, %ymm6
+	vbroadcastss	188(%r10), %ymm13
+	vfnmadd231ps	%ymm5, %ymm13, %ymm7
+
+	vbroadcastss	24(%r11), %ymm13
+	vmulps			%ymm6, %ymm13, %ymm6
+	cmpl			$8, %r12d
+	jl				0f // ret
+	vbroadcastss	220(%r10), %ymm13
+	vfnmadd231ps	%ymm6, %ymm13, %ymm7
+
+	vbroadcastss	28(%r11), %ymm13
+	vmulps			%ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_trsm_rlt_inv_8x8_vs_lib8, .-inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// Cholesky factorization
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
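+// For reference, a scalar C sketch of the factorization (illustrative only,
+// not used by the kernel; D here is the 8x8 block held in ymm0..ymm7 at
+// panel index ii+8*jj): right-looking Cholesky, where a non-positive pivot
+// zeroes the column scale instead of taking the square root:
+//
+//   for(jj=0; jj<kn; jj++) {
+//     float djj  = D[jj+8*jj];
+//     float dinv = djj>0.0f ? 1.0f/sqrtf(djj) : 0.0f;   // sqrtf from <math.h>
+//     inv_diag_E[jj] = dinv;
+//     for(ii=0; ii<8; ii++) D[ii+8*jj] *= dinv;
+//     for(kk=jj+1; kk<kn; kk++)
+//       for(ii=0; ii<8; ii++)
+//         D[ii+8*kk] -= D[ii+8*jj] * D[kk+8*jj];
+//   }
+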
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_potrf_8x8_vs_lib8, @function
+inner_edge_potrf_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_potrf_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x8_vs_lib8:
+#endif
+#endif
+
+	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovss	.LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovss	LC03(%rip), %xmm14 // 1.0
+#endif
+
+	vmovss			%xmm0, %xmm0, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe			1f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+2:
+	vmovss			%xmm13, 0(%r10)
+	vbroadcastss	%xmm13, %ymm13
+//	vpermilps		$0x00, %xmm13, %xmm13
+//	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
+	vmulps			%ymm0, %ymm13, %ymm0
+	vperm2f128		$0x00, %ymm0, %ymm0, %ymm11
+	vpermilps		$0x55, %ymm11, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm1
+	vpermilps		$0xaa, %ymm11, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm2
+	vpermilps		$0xff, %ymm11, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm3
+	vperm2f128		$0x11, %ymm0, %ymm0, %ymm11
+	vpermilps		$0x00, %ymm11, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm4
+	vpermilps		$0x55, %ymm11, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm5
+	vpermilps		$0xaa, %ymm11, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm6
+	vpermilps		$0xff, %ymm11, %ymm13
+	vfnmadd231ps	%ymm0, %ymm13, %ymm7
+
+
+	vpermilps		$0x55, %xmm1, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe			3f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+4:
+	vmovss			%xmm13, 4(%r10)
+	vbroadcastss	%xmm13, %ymm13
+	vmulps			%ymm1, %ymm13, %ymm1
+	vperm2f128		$0x00, %ymm1, %ymm1, %ymm11
+	vpermilps		$0xaa, %ymm11, %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm2
+	vpermilps		$0xff, %ymm11, %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm3
+	vperm2f128		$0x11, %ymm1, %ymm1, %ymm11
+	vpermilps		$0x00, %ymm11, %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm4
+	vpermilps		$0x55, %ymm11, %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm5
+	vpermilps		$0xaa, %ymm11, %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm6
+	vpermilps		$0xff, %ymm11, %ymm13
+	vfnmadd231ps	%ymm1, %ymm13, %ymm7
+
+
+	vpermilps		$0xaa, %xmm2, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe			5f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+6:
+	vmovss			%xmm13, 8(%r10)
+	vbroadcastss	%xmm13, %ymm13
+	vmulps			%ymm2, %ymm13, %ymm2
+	vperm2f128		$0x00, %ymm2, %ymm2, %ymm11
+	vpermilps		$0xff, %ymm11, %ymm13
+	vfnmadd231ps	%ymm2, %ymm13, %ymm3
+	vperm2f128		$0x11, %ymm2, %ymm2, %ymm11
+	vpermilps		$0x00, %ymm11, %ymm13
+	vfnmadd231ps	%ymm2, %ymm13, %ymm4
+	vpermilps		$0x55, %ymm11, %ymm13
+	vfnmadd231ps	%ymm2, %ymm13, %ymm5
+	vpermilps		$0xaa, %ymm11, %ymm13
+	vfnmadd231ps	%ymm2, %ymm13, %ymm6
+	vpermilps		$0xff, %ymm11, %ymm13
+	vfnmadd231ps	%ymm2, %ymm13, %ymm7
+
+
+	vpermilps		$0xff, %xmm3, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe			7f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+8:
+	vmovss			%xmm13, 12(%r10)
+	vbroadcastss	%xmm13, %ymm13
+	vmulps			%ymm3, %ymm13, %ymm3
+	vperm2f128		$0x11, %ymm3, %ymm3, %ymm11
+	vpermilps		$0x00, %ymm11, %ymm13
+	vfnmadd231ps	%ymm3, %ymm13, %ymm4
+	vpermilps		$0x55, %ymm11, %ymm13
+	vfnmadd231ps	%ymm3, %ymm13, %ymm5
+	vpermilps		$0xaa, %ymm11, %ymm13
+	vfnmadd231ps	%ymm3, %ymm13, %ymm6
+	vpermilps		$0xff, %ymm11, %ymm13
+	vfnmadd231ps	%ymm3, %ymm13, %ymm7
+
+
+	vextractf128	$0x1, %ymm4, %xmm13
+//	vpermilps		$0x00, %xmm13, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_44 > 0.0 ?
+	jbe			9f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+10:
+	vmovss			%xmm13, 16(%r10)
+	vbroadcastss	%xmm13, %ymm13
+	vmulps			%ymm4, %ymm13, %ymm4
+	cmpl		$6, %r11d
+	jl			0f // ret
+	vperm2f128		$0x11, %ymm4, %ymm4, %ymm11
+	vpermilps		$0x55, %ymm11, %ymm13
+	vfnmadd231ps	%ymm4, %ymm13, %ymm5
+	vpermilps		$0xaa, %ymm11, %ymm13
+	vfnmadd231ps	%ymm4, %ymm13, %ymm6
+	vpermilps		$0xff, %ymm11, %ymm13
+	vfnmadd231ps	%ymm4, %ymm13, %ymm7
+
+
+	vextractf128	$0x1, %ymm5, %xmm13
+	vpermilps		$0x55, %xmm13, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_55 > 0.0 ?
+	jbe			11f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+12:
+	vmovss			%xmm13, 20(%r10)
+	vbroadcastss	%xmm13, %ymm13
+	vmulps			%ymm5, %ymm13, %ymm5
+	cmpl		$7, %r11d
+	jl			0f // ret
+	vperm2f128		$0x11, %ymm5, %ymm5, %ymm11
+	vpermilps		$0xaa, %ymm11, %ymm13
+	vfnmadd231ps	%ymm5, %ymm13, %ymm6
+	vpermilps		$0xff, %ymm11, %ymm13
+	vfnmadd231ps	%ymm5, %ymm13, %ymm7
+
+
+	vextractf128	$0x1, %ymm6, %xmm13
+	vpermilps		$0xaa, %xmm13, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_66 > 0.0 ?
+	jbe			13f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+14:
+	vmovss			%xmm13, 24(%r10)
+	vbroadcastss	%xmm13, %ymm13
+	vmulps			%ymm6, %ymm13, %ymm6
+	cmpl		$8, %r11d
+	jl			0f // ret
+	vperm2f128		$0x11, %ymm6, %ymm6, %ymm11
+	vpermilps		$0xff, %ymm11, %ymm13
+	vfnmadd231ps	%ymm6, %ymm13, %ymm7
+
+
+	vextractf128	$0x1, %ymm7, %xmm13
+	vpermilps		$0xff, %xmm13, %xmm13
+	vucomiss		%xmm15, %xmm13 // d_77 > 0.0 ?
+	jbe			15f
+	vsqrtss			%xmm13, %xmm13, %xmm13
+	vdivss			%xmm13, %xmm14, %xmm13
+16:
+	vmovss			%xmm13, 28(%r10)
+	vbroadcastss	%xmm13, %ymm13
+	vmulps			%ymm7, %ymm13, %ymm7
+
+
+	jmp		0f
+
+
+1:
+	vxorps	%ymm13, %ymm13, %ymm13
+	jmp		2b
+
+3:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		4b
+
+5:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		6b
+
+7:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		8b
+
+9:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		10b
+
+11:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		12b
+
+13:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		14b
+
+15:
+	vxorpd	%ymm13, %ymm13, %ymm13
+	jmp		16b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_potrf_8x8_vs_lib8, .-inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
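+// For reference, a scalar C sketch (illustrative only, not used by the
+// kernel; acc is an assumed name for the block in ymm0..ymm7): the beta==0.0
+// test below only skips the loads of C, the alpha scaling always runs:
+//
+//   for(jj=0; jj<8; jj++)
+//     for(ii=0; ii<8; ii++)
+//       acc[ii+8*jj] *= alpha;
+//   if(beta!=0.0)
+//     for(jj=0; jj<8; jj++)
+//       for(ii=0; ii<8; ii++)
+//         acc[ii+8*jj] += beta * C[ii+8*jj];
+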
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x8_lib8, @function
+inner_scale_ab_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovaps		0(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm3
+	vmovaps		128(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm4
+	vmovaps		160(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm5
+	vmovaps		192(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm6
+	vmovaps		224(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x8_lib8, .-inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// r15  <- n0 // col index: start from (inc)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_8x8_gen_lib8, @function
+inner_scale_ab_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_gen_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+
+	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm0
+	vmovaps		32(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm1
+	vmovaps		64(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm2
+	vmovaps		96(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm3
+	vmovaps		128(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm4
+	vmovaps		160(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm5
+	vmovaps		192(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm6
+	vmovaps		224(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_8x8_gen_lib8, .-inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
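+// For reference, a scalar C sketch (illustrative only, not used by the
+// kernel; acc is an assumed name): alpha=1.0 and beta=1.0, so the block of C
+// is simply added to the accumulators with no multiplies:
+//
+//   for(jj=0; jj<8; jj++)
+//     for(ii=0; ii<8; ii++)
+//       acc[ii+8*jj] += C[ii+8*jj];
+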
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_8x8_lib8, @function
+inner_scale_11_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_8x8_lib8; .scl 2; .type 32; .endef
+inner_scale_11_8x8_lib8:
+#endif
+#endif
+	
+	vmovaps		0(%r10), %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r10), %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r10), %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r10), %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+	vmovaps		128(%r10), %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+	vmovaps		160(%r10), %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+	vmovaps		192(%r10), %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+	vmovaps		224(%r10), %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_8x8_lib8, .-inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_11_8X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_11_8x8_gen_lib8, @function
+inner_scale_11_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_11_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_8x8_gen_lib8:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r11), %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r11), %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r11), %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r11), %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+	vmovaps		128(%r11), %ymm12
+	vaddps		%ymm4, %ymm12, %ymm4
+	vmovaps		160(%r11), %ymm12
+	vaddps		%ymm5, %ymm12, %ymm5
+	vmovaps		192(%r11), %ymm12
+	vaddps		%ymm6, %ymm12, %ymm6
+	vmovaps		224(%r11), %ymm12
+	vaddps		%ymm7, %ymm12, %ymm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %r15 // C0
+	addq	%r12, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_11_8x8_gen_lib8, .-inner_scale_11_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
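+// The shuffle-scheme nt kernel leaves the accumulators column-permuted: per
+// the layouts documented for it, register r (r=0..3), lane ii, holds element
+// d[ii][(ii&3)^perm[r]] with perm={0,1,3,2}, and registers 4..7 hold columns
+// 4..7 the same way. For reference, a scalar C sketch of the unscrambling
+// performed by the vblendps pairs below (illustrative only; acc/col/perm are
+// assumed names), before the usual alpha/beta scaling:
+//
+//   static const int perm[4] = {0, 1, 3, 2};
+//   for(r=0; r<4; r++)
+//     for(ii=0; ii<8; ii++) {
+//       col[(ii&3)^perm[r]][ii]     = acc[r][ii];    // columns 0..3
+//       col[4+((ii&3)^perm[r])][ii] = acc[4+r][ii];  // columns 4..7
+//     }
+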
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x8_lib8, @function
+inner_blend_scale_ab_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x8_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
+	vblendps	$0x55, %ymm3, %ymm2, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
+	vblendps	$0x33, %ymm14, %ymm13, %ymm3
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
+	vblendps	$0x55, %ymm5, %ymm4, %ymm13
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
+	vblendps	$0x55, %ymm7, %ymm6, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
+	vblendps	$0x33, %ymm15, %ymm12, %ymm6
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
+	vblendps	$0x33, %ymm14, %ymm13, %ymm7
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
+
+	vmovaps		0(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm0
+	vmovaps		32(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm1
+	vmovaps		64(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm2
+	vmovaps		96(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm3
+	vmovaps		128(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm4
+	vmovaps		160(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm5
+	vmovaps		192(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm6
+	vmovaps		224(%r12), %ymm15
+	vfmadd231ps	%ymm15, %ymm14, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x8_lib8, .-inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12  <- offset
+// r13   <- C
+// r14  <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_8X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_8x8_gen_lib8, @function
+inner_blend_scale_ab_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x8_gen_lib8:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastss	0(%r10), %ymm11
+
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
+	vblendps	$0x55, %ymm3, %ymm2, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
+	vblendps	$0x33, %ymm14, %ymm13, %ymm3
+
+	vmulps		%ymm0, %ymm11, %ymm0
+	vmulps		%ymm1, %ymm11, %ymm1
+	vmulps		%ymm2, %ymm11, %ymm2
+	vmulps		%ymm3, %ymm11, %ymm3
+
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
+	vblendps	$0x55, %ymm5, %ymm4, %ymm13
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
+	vblendps	$0x55, %ymm7, %ymm6, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
+	vblendps	$0x33, %ymm15, %ymm12, %ymm6
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
+	vblendps	$0x33, %ymm14, %ymm13, %ymm7
+
+	vmulps		%ymm4, %ymm11, %ymm4
+	vmulps		%ymm5, %ymm11, %ymm5
+	vmulps		%ymm6, %ymm11, %ymm6
+	vmulps		%ymm7, %ymm11, %ymm7
+
+	// beta
+	vbroadcastss	0(%r11), %ymm15
+
+	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			3f // end
+
+	cmpl	$0, %r12d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm0
+	vmovaps		32(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm1
+	vmovaps		64(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm2
+	vmovaps		96(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm3
+	vmovaps		128(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm4
+	vmovaps		160(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm5
+	vmovaps		192(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm6
+	vmovaps		224(%r13), %ymm12
+	vfmadd231ps	%ymm12, %ymm15, %ymm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r12d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r12d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_8x8_gen_lib8, .-inner_blend_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_8x8_lib8, @function
+inner_blend_scale_11_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_8x8_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x8_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
+	vblendps	$0x55, %ymm3, %ymm2, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
+	vblendps	$0x33, %ymm14, %ymm13, %ymm3
+
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
+	vblendps	$0x55, %ymm5, %ymm4, %ymm13
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
+	vblendps	$0x55, %ymm7, %ymm6, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
+	vblendps	$0x33, %ymm15, %ymm12, %ymm6
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
+	vblendps	$0x33, %ymm14, %ymm13, %ymm7
+
+	vmovaps		0(%r10), %ymm15
+	vaddps		%ymm0, %ymm15, %ymm0
+	vmovaps		32(%r10), %ymm15
+	vaddps		%ymm1, %ymm15, %ymm1
+	vmovaps		64(%r10), %ymm15
+	vaddps		%ymm2, %ymm15, %ymm2
+	vmovaps		96(%r10), %ymm15
+	vaddps		%ymm3, %ymm15, %ymm3
+	vmovaps		128(%r10), %ymm15
+	vaddps		%ymm4, %ymm15, %ymm4
+	vmovaps		160(%r10), %ymm15
+	vaddps		%ymm5, %ymm15, %ymm5
+	vmovaps		192(%r10), %ymm15
+	vaddps		%ymm6, %ymm15, %ymm6
+	vmovaps		224(%r10), %ymm15
+	vaddps		%ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_8x8_lib8, .-inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10  <- offset
+// r11   <- C
+// r12  <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_8X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_8x8_gen_lib8, @function
+inner_blend_scale_11_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x8_gen_lib8:
+#endif
+#endif
+	
+	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
+	vblendps	$0x55, %ymm3, %ymm2, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
+	vblendps	$0x33, %ymm14, %ymm13, %ymm3
+
+	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
+	vblendps	$0x55, %ymm5, %ymm4, %ymm13
+	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
+	vblendps	$0x55, %ymm7, %ymm6, %ymm15
+
+	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
+	vblendps	$0x33, %ymm15, %ymm12, %ymm6
+	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
+	vblendps	$0x33, %ymm14, %ymm13, %ymm7
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+
+	vmovaps		0(%r11), %ymm12
+	vaddps		%ymm0, %ymm12, %ymm0
+	vmovaps		32(%r11), %ymm12
+	vaddps		%ymm1, %ymm12, %ymm1
+	vmovaps		64(%r11), %ymm12
+	vaddps		%ymm2, %ymm12, %ymm2
+	vmovaps		96(%r11), %ymm12
+	vaddps		%ymm3, %ymm12, %ymm3
+	vmovaps		128(%r11), %ymm12
+	vaddps		%ymm4, %ymm12, %ymm4
+	vmovaps		160(%r11), %ymm12
+	vaddps		%ymm5, %ymm12, %ymm5
+	vmovaps		192(%r11), %ymm12
+	vaddps		%ymm6, %ymm12, %ymm6
+	vmovaps		224(%r11), %ymm12
+	vaddps		%ymm7, %ymm12, %ymm7
+
+	jmp		7f
+
+0:
+
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %r15 // C0
+	addq	%r12, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_8x8_gen_lib8, .-inner_blend_scale_11_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x8_lib8, @function
+inner_store_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x8_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_lib8:
+#endif
+#endif
+	
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps 	%ymm3, 96(%r10)
+	vmovaps 	%ymm4, 128(%r10)
+	vmovaps 	%ymm5, 160(%r10)
+	vmovaps 	%ymm6, 192(%r10)
+	vmovaps 	%ymm7, 224(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x8_lib8, .-inner_store_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
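+// The row mask is built by broadcasting float(km) and comparing it against
+// the ascending lane constants in LC00 (defined elsewhere in this file), so
+// that lanes with row index below km get a set sign bit for vmaskmovps.
+// For reference, a scalar C sketch of the effect (illustrative only; note
+// that columns 0..4 are always written, i.e. the routine assumes kn>4):
+//
+//   for(jj=0; jj<kn; jj++)
+//     for(ii=0; ii<km; ii++)
+//       D[ii+8*jj] = acc[ii+8*jj];
+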
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x8_vs_lib8, @function
+inner_store_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	vmaskmovps	%ymm0, %ymm15,  0(%r10)
+	vmaskmovps	%ymm1, %ymm15,  32(%r10)
+	vmaskmovps	%ymm2, %ymm15,  64(%r10)
+	vmaskmovps	%ymm3, %ymm15,  96(%r10)
+	vmaskmovps	%ymm4, %ymm15,  128(%r10)
+	cmpl		$6, %r12d
+	jl			0f // end
+	vmaskmovps	%ymm5, %ymm15, 160(%r10)
+	cmpl		$7, %r12d
+	jl			0f // end
+	vmaskmovps	%ymm6, %ymm15, 192(%r10)
+	je			0f // end
+	vmaskmovps	%ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x8_vs_lib8, .-inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
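+// For reference, a scalar C sketch of the intended effect for offset==0
+// (illustrative only; acc is an assumed name): only rows [m0,m1) and columns
+// [n0,n1) of the 8x8 block are written, via the row mask and the column
+// shifting below; the offset>0 paths are still TODO:
+//
+//   for(jj=n0; jj<n1 && jj<8; jj++)
+//     for(ii=m0; ii<m1 && ii<8; ii++)
+//       D[ii+8*jj] = acc[ii+8*jj];
+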
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_8X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_8x8_gen_lib8, @function
+inner_store_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm12, %ymm15
+	vandps		%ymm14, %ymm15, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$8, %eax
+	jle		0f
+	movl	$8, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm15,  0(%r11)
+	vmaskmovps	%ymm1, %ymm15,  32(%r11)
+	vmaskmovps	%ymm2, %ymm15,  64(%r11)
+	vmaskmovps	%ymm3, %ymm15,  96(%r11)
+	vmaskmovps	%ymm4, %ymm15,  128(%r11)
+	cmpl		$6, %r15d
+	jl			7f // end
+	vmaskmovps	%ymm5, %ymm15, 160(%r11)
+	cmpl		$7, %r15d
+	jl			7f // end
+	vmaskmovps	%ymm6, %ymm15, 192(%r11)
+	je			7f // end
+	vmaskmovps	%ymm7, %ymm15, 224(%r11)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_8x8_gen_lib8, .-inner_store_8x8_gen_lib8
+#endif
+#endif
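+
+// Editor's note: the generalized store ANDs two row masks (rows >= m0 from
+// ymm14, rows < m1 from ymm15) and handles the column window by shifting the
+// accumulators down by n0 registers while advancing D by 32 bytes per skipped
+// column, so only the n1-n0 requested columns (clipped to 8) are written; the
+// non-zero offsetD cases are still marked TODO. C sketch of the row-window
+// mask (illustrative only):
+//
+//   #include <stdint.h>
+//   static void row_window_mask_ref(int32_t mask[8], int m0, int m1)
+//       {
+//       for(int i=0; i<8; i++)
+//           mask[i] = (i >= m0 && i < m1) ? -1 : 0;
+//       }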
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X8_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x8_lib8, @function
+inner_store_l_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x8_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_lib8:
+#endif
+#endif
+	
+	vmovaps 	%ymm0,  0(%r10)
+	vmovaps		32(%r10), %ymm14
+	vblendps	$0x01, %ymm14, %ymm1, %ymm1
+	vmovaps 	%ymm1, 32(%r10)
+	vmovaps		64(%r10), %ymm14
+	vblendps	$0x03, %ymm14, %ymm2, %ymm2
+	vmovaps 	%ymm2, 64(%r10)
+	vmovaps		96(%r10), %ymm14
+	vblendps	$0x07, %ymm14, %ymm3, %ymm3
+	vmovaps 	%ymm3, 96(%r10)
+	vmovaps		128(%r10), %ymm14
+	vblendps	$0x0f, %ymm14, %ymm4, %ymm4
+	vmovaps 	%ymm4, 128(%r10)
+	vmovaps		160(%r10), %ymm14
+	vblendps	$0x1f, %ymm14, %ymm5, %ymm5
+	vmovaps 	%ymm5, 160(%r10)
+	vmovaps		192(%r10), %ymm14
+	vblendps	$0x3f, %ymm14, %ymm6, %ymm6
+	vmovaps 	%ymm6, 192(%r10)
+	vmovaps		224(%r10), %ymm14
+	vblendps	$0x7f, %ymm14, %ymm7, %ymm7
+	vmovaps 	%ymm7, 224(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_lib8, .-inner_store_l_8x8_lib8
+#endif
+#endif
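+
+// Editor's note: the "store lower" variant keeps the strictly upper part of D
+// unchanged: for column j it reloads the old column and blends back its first
+// j elements (vblendps with mask 2^j-1) before storing, so only the lower
+// triangle, diagonal included, is overwritten. C sketch (illustrative only;
+// acc[j][i] again stands for lane i of ymm<j>):
+//
+//   static void store_l_8x8_ref(float D[64], const float acc[8][8])
+//       {
+//       for(int j=0; j<8; j++)
+//           for(int i=j; i<8; i++) // rows above the diagonal keep their old value
+//               D[i+8*j] = acc[j][i];
+//       }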
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10  <- D
+// r11  <- km
+// r12  <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X8_VS_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x8_vs_lib8, @function
+inner_store_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_vs_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm15, %ymm12, %ymm15
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm15,  0(%r10)
+	vmovaps 	32(%r10), %ymm12
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm15,  32(%r10)
+	vmovaps 	64(%r10), %ymm12
+	vblendps	$0x03, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm15,  64(%r10)
+	vmovaps 	96(%r10), %ymm12
+	vblendps	$0x07, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm15,  96(%r10)
+	vmovaps 	128(%r10), %ymm12
+	vblendps	$0x0f, %ymm12, %ymm4, %ymm4
+	vmaskmovps	%ymm4, %ymm15,  128(%r10)
+	cmpl		$6, %r12d
+	jl			0f // end
+	vmovaps 	160(%r10), %ymm12
+	vblendps	$0x1f, %ymm12, %ymm5, %ymm5
+	vmaskmovps	%ymm5, %ymm15, 160(%r10)
+	cmpl		$7, %r12d
+	jl			0f // end
+	vmovaps 	192(%r10), %ymm12
+	vblendps	$0x3f, %ymm12, %ymm6, %ymm6
+	vmaskmovps	%ymm6, %ymm15, 192(%r10)
+	je			0f // end
+	vmovaps 	224(%r10), %ymm12
+	vblendps	$0x7f, %ymm12, %ymm7, %ymm7
+	vmaskmovps	%ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_vs_lib8, .-inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 8*sdd*sizeof(float)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_8X8_GEN_LIB8
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_8x8_gen_lib8, @function
+inner_store_l_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_gen_lib8:
+#endif
+#endif
+	
+	// compute mask for rows
+	vcvtsi2ss	%r13d, %xmm14, %xmm14
+	vcvtsi2ss	%r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovups		.LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+	vmovups		LC00(%rip), %ymm12
+#endif
+	vshufps		$0x00, %xmm14, %xmm14, %xmm14
+	vshufps		$0x00, %xmm15, %xmm15, %xmm15
+	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
+	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
+	vsubps		%ymm12, %ymm14, %ymm14
+	vsubps		%ymm15, %ymm12, %ymm15
+	vandps		%ymm14, %ymm15, %ymm15
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm6, %ymm5
+	vmovaps		%ymm7, %ymm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	vmovaps		%ymm6, %ymm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	vmovaps		%ymm1, %ymm0
+	vmovaps		%ymm2, %ymm1
+	vmovaps		%ymm3, %ymm2
+	vmovaps		%ymm4, %ymm3
+	vmovaps		%ymm5, %ymm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$8, %eax
+	jle		0f
+	movl	$8, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	// offset==0
+	vmaskmovps	%ymm0, %ymm15,  0(%r11)
+	vmovaps 	32(%r11), %ymm12
+	vblendps	$0x01, %ymm12, %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm15,  32(%r11)
+	vmovaps 	64(%r11), %ymm12
+	vblendps	$0x03, %ymm12, %ymm2, %ymm2
+	vmaskmovps	%ymm2, %ymm15,  64(%r11)
+	vmovaps 	96(%r11), %ymm12
+	vblendps	$0x07, %ymm12, %ymm3, %ymm3
+	vmaskmovps	%ymm3, %ymm15,  96(%r11)
+	vmovaps 	128(%r11), %ymm12
+	vblendps	$0x0f, %ymm12, %ymm4, %ymm4
+	vmaskmovps	%ymm4, %ymm15,  128(%r11)
+	cmpl		$6, %r15d
+	jl			7f // end
+	vmovaps 	160(%r11), %ymm12
+	vblendps	$0x1f, %ymm12, %ymm5, %ymm5
+	vmaskmovps	%ymm5, %ymm15, 160(%r11)
+	cmpl		$7, %r15d
+	jl			7f // end
+	vmovaps 	192(%r11), %ymm12
+	vblendps	$0x3f, %ymm12, %ymm6, %ymm6
+	vmaskmovps	%ymm6, %ymm15, 192(%r11)
+	je			7f // end
+	vmovaps 	224(%r11), %ymm12
+	vblendps	$0x7f, %ymm12, %ymm7, %ymm7
+	vmaskmovps	%ymm7, %ymm15, 224(%r11)
+	//
+	jmp		7f
+
+0:
+	// offset > 0
+	// 1 2 3 4 5 6 7
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+	cmpl	$4, %r10d
+	jl		1f
+	jg		2f
+
+	// offset==4
+	// TODO
+	jmp		7f
+
+1:
+	// 1 2 3
+
+	cmpl	$2, %r10d
+	jl		3f
+	jg		4f
+
+	// offset==2
+	// TODO
+	jmp		7f
+
+3:
+	// offset==1
+	// TODO
+	jmp		7f
+
+4:
+	// offset==3
+	// TODO
+	jmp		7f
+
+2:
+	// 5 6 7
+
+	cmpl	$6, %r10d
+	jl		5f
+	jg		6f
+
+	// offset==6
+	// TODO
+	jmp		7f
+
+5:
+	// offset==5
+	// TODO
+	jmp		7f
+
+6:
+	// offset==7
+	// TODO
+	jmp		7f
+
+	// end
+7:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_gen_lib8, .-inner_store_l_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+//                               1      2             3         4         5            6         7
+// void kernel_sgemm_nt_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x8_lib8
+	.type kernel_sgemm_nt_8x8_lib8, @function
+kernel_sgemm_nt_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x8_lib8
+_kernel_sgemm_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x8_lib8
+	.def kernel_sgemm_nt_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x8_lib8, .-kernel_sgemm_nt_8x8_lib8
+#endif
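+
+// Editor's note: a plain-C reference of what the kernel above computes, for
+// readers not fluent in AVX assembly. A and B are lib8 panels (8 rows stored
+// contiguously per column), B enters transposed ("nt"), and the result is
+// D = alpha*A*B^T + beta*C on one 8x8 block. Illustrative sketch only; the
+// function name is hypothetical and alpha/beta are passed by value here:
+//
+//   static void sgemm_nt_8x8_ref(int k, float alpha, const float *A,
+//                                const float *B, float beta,
+//                                const float *C, float *D)
+//       {
+//       for(int j=0; j<8; j++)
+//           for(int i=0; i<8; i++)
+//               {
+//               float d = 0.0f;
+//               for(int l=0; l<k; l++)
+//                   d += A[i+8*l]*B[j+8*l]; // column l of each 8-row panel
+//               D[i+8*j] = alpha*d + beta*C[i+8*j];
+//               }
+//       }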
+
+
+
+
+
+//                                  1      2             3         4         5            6         7         8       9
+// void kernel_sgemm_nt_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x8_vs_lib8
+	.type kernel_sgemm_nt_8x8_vs_lib8, @function
+kernel_sgemm_nt_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x8_vs_lib8
+_kernel_sgemm_nt_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x8_vs_lib8
+	.def kernel_sgemm_nt_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x8_vs_lib8, .-kernel_sgemm_nt_8x8_vs_lib8
+#endif
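+
+// Editor's note: the _vs ("variable size") variant performs the same
+// computation but masks the store with km/kn so that only the top-left
+// km x kn corner of the 8x8 block is written; columns 0-4 are always stored,
+// so this kernel targets edge blocks with kn > 4. Hypothetical usage sketch
+// (the pointers are assumed to address valid lib8 panels):
+//
+//   // write only the top-left 5x6 corner of the 8x8 result
+//   // kernel_sgemm_nt_8x8_vs_lib8(k, &alpha, A, B, &beta, C, D, 5, 6);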
+
+
+
+
+
+//                                   rdi    rsi           rdx       rcx       r8           r9           rsp+8     rsp+16   rsp+24       rsp+32    rsp+40   rsp+48  rsp+56  rsp+64  rsp+72
+// void kernel_sgemm_nt_8x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nt_8x8_gen_lib8
+	.type kernel_sgemm_nt_8x8_gen_lib8, @function
+kernel_sgemm_nt_8x8_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nt_8x8_gen_lib8
+_kernel_sgemm_nt_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nt_8x8_gen_lib8
+	.def kernel_sgemm_nt_8x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // offsetC
+	movq	ARG7, %r13 // C
+	movq	ARG8, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG9, %r10 // offsetD
+	movq	ARG10, %r11 // D
+	movq	ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG12, %r13 // m0
+	movq	ARG13, %r14 // m1
+	movq	ARG14, %r15 // n0
+	movq	ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nt_8x8_gen_lib8, .-kernel_sgemm_nt_8x8_gen_lib8
+#endif
+
+
+
+
+
+//                               1      2             3         4            5         6        7            8         9
+// void kernel_sgemm_nn_8x8_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x8_lib8
+	.type kernel_sgemm_nn_8x8_lib8, @function
+kernel_sgemm_nn_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x8_lib8
+_kernel_sgemm_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x8_lib8
+	.def kernel_sgemm_nn_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x8_lib8, .-kernel_sgemm_nn_8x8_lib8
+#endif
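+
+// Editor's note: in the "nn" kernels B is not transposed, so it is traversed
+// down the k dimension across 8-row panels: sdb is B's panel stride (in
+// columns) and offsetB is the row offset of the first useful element inside
+// the current panel, which the edge routine consumes before the aligned main
+// loop. Rough C picture of the addressing (illustrative only):
+//
+//   // element B(l, j) of a lib8 panel-major matrix with panel stride sdb:
+//   // B[(l/8)*8*sdb + (l%8) + 8*j]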
+
+
+
+
+
+//                               1      2             3         4            5         6        7            8         9         10      11
+// void kernel_sgemm_nn_8x8_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x8_vs_lib8
+	.type kernel_sgemm_nn_8x8_vs_lib8, @function
+kernel_sgemm_nn_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x8_vs_lib8
+_kernel_sgemm_nn_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x8_vs_lib8
+	.def kernel_sgemm_nn_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // km
+	movq	ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x8_vs_lib8, .-kernel_sgemm_nn_8x8_vs_lib8
+#endif
+
+
+
+
+
+//                                   rdi    rsi           rdx       rcx       r8        r9       rsp+8        rsp+16    rsp+24    rsp+32    rsp+40   rsp+48     rsp+56   rsp+64  rsp+72  rsp+80  rsp+88
+// void kernel_sgemm_nn_8x8_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_nn_8x8_gen_lib8
+	.type kernel_sgemm_nn_8x8_gen_lib8, @function
+kernel_sgemm_nn_8x8_gen_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_nn_8x8_gen_lib8
+_kernel_sgemm_nn_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_nn_8x8_gen_lib8
+	.def kernel_sgemm_nn_8x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_gen_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12 // offsetC
+	movq	ARG9, %r13 // C
+	movq	ARG10, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+	// store n gen
+
+	movq	ARG11, %r10 // offsetD
+	movq	ARG12, %r11 // D
+	movq	ARG13, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+	movq	ARG14, %r13 // m0
+	movq	ARG15, %r14 // m1
+	movq	ARG16, %r15 // n0
+	movq	ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_gen_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_nn_8x8_gen_lib8, .-kernel_sgemm_nn_8x8_gen_lib8
+#endif
+
+
+
+
+
+//                                 rdi    rsi           rdx       rcx       r8           r9        rsp+8
+// void kernel_ssyrk_nt_l_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_8x8_lib8
+	.type kernel_ssyrk_nt_l_8x8_lib8, @function
+kernel_ssyrk_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_8x8_lib8
+_kernel_ssyrk_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_8x8_lib8
+	.def kernel_ssyrk_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_8x8_lib8, .-kernel_ssyrk_nt_l_8x8_lib8
+#endif
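+
+// Editor's note: ssyrk_nt_l runs the same alpha*A*B^T + beta*C update as the
+// gemm kernel (a true syrk call passes B = A) but stores only the lower
+// triangle of the 8x8 result. Reference sketch (illustrative only):
+//
+//   static void ssyrk_nt_l_8x8_ref(int k, float alpha, const float *A,
+//                                  const float *B, float beta,
+//                                  const float *C, float *D)
+//       {
+//       for(int j=0; j<8; j++)
+//           for(int i=j; i<8; i++) // lower triangle only
+//               {
+//               float d = 0.0f;
+//               for(int l=0; l<k; l++)
+//                   d += A[i+8*l]*B[j+8*l];
+//               D[i+8*j] = alpha*d + beta*C[i+8*j];
+//               }
+//       }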
+
+
+
+
+
+//                                    1      2             3         4         5            6         7         8       9
+// void kernel_ssyrk_nt_l_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_nt_l_8x8_vs_lib8
+	.type kernel_ssyrk_nt_l_8x8_vs_lib8, @function
+kernel_ssyrk_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_nt_l_8x8_vs_lib8
+_kernel_ssyrk_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_nt_l_8x8_vs_lib8
+	.def kernel_ssyrk_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_nt_l_8x8_vs_lib8, .-kernel_ssyrk_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+//                                      edi    rsi       rdx       ecx       r8        r9        rsp+8     
+// void kernel_strsm_nt_rl_inv_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_8x8_lib8
+	.type kernel_strsm_nt_rl_inv_8x8_lib8, @function
+kernel_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_8x8_lib8
+_kernel_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_8x8_lib8
+	.def kernel_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+	movq	$8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_8x8_lib8, .-kernel_strsm_nt_rl_inv_8x8_lib8
+#endif
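+
+// Editor's note: this kernel accumulates M = C - A*B^T (gemm "sub" path plus
+// the scale_11 load of C) and then solves X * E^T = M for X, with E lower
+// triangular and its diagonal replaced by the precomputed reciprocals
+// inv_diag_E, storing X into D. Reference sketch of the solve on one 8x8
+// block (illustrative only; M, E and D are lib8 panels):
+//
+//   static void trsm_rlt_inv_8x8_ref(const float M[64], const float *E,
+//                                    const float *inv_diag_E, float *D)
+//       {
+//       for(int j=0; j<8; j++)
+//           for(int i=0; i<8; i++)
+//               {
+//               float d = M[i+8*j];
+//               for(int t=0; t<j; t++)
+//                   d -= D[i+8*t]*E[j+8*t]; // E(j,t), lower triangle of E
+//               D[i+8*j] = d*inv_diag_E[j];
+//               }
+//       }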
+
+
+
+
+
+//                                         edi    rsi       rdx       ecx       r8        r9        rsp+8               rsp+16  rsp+24  
+// void kernel_strsm_nt_rl_inv_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
+	.type kernel_strsm_nt_rl_inv_8x8_vs_lib8, @function
+kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_strsm_nt_rl_inv_8x8_vs_lib8
+_kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
+	.def kernel_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn // TODO scale gen
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG8, %r11 // m1 
+	movq	ARG9, %r12 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_strsm_nt_rl_inv_8x8_vs_lib8
+#endif
+
+
+
+
+
+//                                            1       2          3          4       5          6          7         8         9         10
+// void kernel_sgemm_strsm_nt_rl_inv_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	$8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10   // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+#endif
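+
+// Editor's note: the fused gemm+trsm kernels run two k-loops before the
+// solve: kp iterations of Ap*Bp^T are added and km iterations of Am*Bm^T are
+// subtracted, so the accumulator holds C + Ap*Bp^T - Am*Bm^T without storing
+// the intermediate gemm result to memory. In short (illustrative notation):
+//
+//   // acc(i,j) = C(i,j) + sum_{l<kp} Ap(i,l)*Bp(j,l) - sum_{l<km} Am(i,l)*Bm(j,l)
+//   // D = acc * E^{-T}, exactly as in kernel_strsm_nt_rl_inv_8x8_lib8 above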
+
+
+
+
+
+//                                               1       2          3          4       5          6          7         8         9         10                 11      12
+// void kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+	.type kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+	.def kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10  // C 
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D 
+	movq	ARG11, %r11 // km 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+#endif
+
+
+
+
+
+//                                  edi    rsi       rdx       rcx       r8        r9
+// void kernel_spotrf_nt_l_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_8x8_lib8
+	.type kernel_spotrf_nt_l_8x8_lib8, @function
+kernel_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_8x8_lib8
+_kernel_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_8x8_lib8
+	.def kernel_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movl	$8, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_8x8_lib8, .-kernel_spotrf_nt_l_8x8_lib8
+#endif
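+
+// Editor's note: after accumulating M = C - A*B^T, the potrf edge routine
+// performs an in-register Cholesky factorization of the 8x8 block: the lower
+// factor goes to D and the reciprocals of its diagonal go to inv_diag_D so
+// that subsequent trsm kernels can multiply instead of divide. Reference
+// sketch (illustrative only; M is assumed symmetric positive definite):
+//
+//   #include <math.h>
+//   static void potrf_l_8x8_ref(const float M[64], float *D, float *inv_diag_D)
+//       {
+//       for(int j=0; j<8; j++)
+//           {
+//           float d = M[j+8*j];
+//           for(int t=0; t<j; t++)
+//               d -= D[j+8*t]*D[j+8*t];
+//           float djj = sqrtf(d);
+//           D[j+8*j] = djj;
+//           inv_diag_D[j] = 1.0f/djj;
+//           for(int i=j+1; i<8; i++)
+//               {
+//               float s = M[i+8*j];
+//               for(int t=0; t<j; t++)
+//                   s -= D[i+8*t]*D[j+8*t];
+//               D[i+8*j] = s*inv_diag_D[j];
+//               }
+//           }
+//       }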
+
+
+
+
+
+//                                     edi    rsi       rdx       rcx       r8        r9                  rsp+8   rsp+16
+// void kernel_spotrf_nt_l_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_spotrf_nt_l_8x8_vs_lib8
+	.type kernel_spotrf_nt_l_8x8_vs_lib8, @function
+kernel_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_spotrf_nt_l_8x8_vs_lib8
+_kernel_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_spotrf_nt_l_8x8_vs_lib8
+	.def kernel_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movq	ARG8, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG7, %r11 // m1 
+	movq	ARG8, %r12 // n1 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_8x8_vs_lib8, .-kernel_spotrf_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+//                                        1       2          3          4       5          6          7         8         9
+// void kernel_ssyrk_spotrf_nt_l_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
+	.type kernel_ssyrk_spotrf_nt_l_8x8_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_8x8_lib8
+_kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
+	.def kernel_ssyrk_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$8, %r11d // n1
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_8x8_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+//                                           1       2          3          4       5          6          7         8         9                  10      11
+// void kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+	.type kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+	.globl _kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+	.globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+	.def kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovaps	%ymm0, %ymm1
+	vmovaps	%ymm0, %ymm2
+	vmovaps	%ymm0, %ymm3
+	vmovaps	%ymm0, %ymm4
+	vmovaps	%ymm0, %ymm5
+	vmovaps	%ymm0, %ymm6
+	vmovaps	%ymm0, %ymm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+	callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+	movq	ARG10, %r11 // km 
+	movq	ARG11, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+	callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+	.long	1056964608
+	.long	1069547520
+	.long	1075838976
+	.long	1080033280
+	.long	1083179008
+	.long	1085276160
+	.long	1087373312
+	.long	1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+	.long	1091043328
+	.long	1092091904
+	.long	1093140480
+	.long	1094189056
+	.long	1095237632
+	.long	1096286208
+	.long	1097334784
+	.long	1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+	.long	1099169792
+	.long	1099694080
+	.long	1100218368
+	.long	1100742656
+	.long	1101266944
+	.long	1101791232
+	.long	1102315520
+	.long	1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	1065353216
+	.long	3212836864
+	.long	3212836864
+
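+// Editor's note: the .long values above are the raw IEEE-754 bit patterns of
+// the single-precision constants shown in the comments (1056964608 =
+// 0x3F000000 = 0.5f, ..., 1089470464 = 0x40F00000 = 7.5f, 1065353216 = 1.0f,
+// 3212836864 = -1.0f). .LC00/.LC01/.LC02 are the lane-index vectors used to
+// build store masks, .LC03 is all ones and .LC09 holds -1.0 in its two
+// highest lanes. A quick C check (illustrative only):
+//
+//   #include <stdio.h>
+//   #include <string.h>
+//   int main(void)
+//       {
+//       unsigned int bits[8] = {1056964608, 1069547520, 1075838976, 1080033280,
+//                               1083179008, 1085276160, 1087373312, 1089470464};
+//       for(int i=0; i<8; i++)
+//           {
+//           float f; memcpy(&f, &bits[i], 4);
+//           printf("%f\n", f); // prints 0.500000 ... 7.500000
+//           }
+//       return 0;
+//       }
+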
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/c99/Makefile b/kernel/c99/Makefile
new file mode 100644
index 0000000..55d54ef
--- /dev/null
+++ b/kernel/c99/Makefile
@@ -0,0 +1,80 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgemv_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += kernel_dgemv_4_lib4.o
+#OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o
+OBJS +=
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), GENERIC)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+	rm -f *.o
+	rm -f *.s
+
diff --git a/kernel/c99/kernel_dgemm_4x4_lib4.c b/kernel/c99/kernel_dgemm_4x4_lib4.c
new file mode 100644
index 0000000..167e356
--- /dev/null
+++ b/kernel/c99/kernel_dgemm_4x4_lib4.c
@@ -0,0 +1,6825 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <math.h>
+
+
+
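+// C99 reference implementations of the double-precision 4x4 kernels (dgemm,
+// dsyrk, dtrmm, ...) for matrices packed in 4-row panels ("lib4", bs=4).
+// Each kernel is compiled only for the targets listed in the #if guard that
+// precedes it.
+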
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+//#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
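+// D[0:4,0:4] = beta*C + alpha * A * B^T ("nt": A not transposed, B transposed).
+// "gen" variant: C and D may start at any row offset within their panel
+// (offsetC/offsetD in 0..3, panel strides sdc/sdd), and only rows m0..m1-1 and
+// columns n0..n1-1 of the 4x4 result block are stored.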
+void kernel_dgemm_nt_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	double
+		*C1, *D1;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(offsetC==0)
+		{
+		c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[0+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[0+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[1+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[0+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[1+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C0[2+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==1)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[1+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[1+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[2+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[1+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[2+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C0[3+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==2)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[2+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[2+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[3+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[2+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[3+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C1[0+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+		}
+	else //if(offsetC==3)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[3+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[3+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C1[0+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[3+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C1[0+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C1[1+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+		}
+	
+	// shift the solution left so that columns 0..n0-1 of the block are skipped
+	if(n0>0)
+		{
+		if(n0==1)
+			{
+			c_00 = c_01;
+			c_10 = c_11;
+			c_20 = c_21;
+			c_30 = c_31;
+
+			c_01 = c_02;
+			c_11 = c_12;
+			c_21 = c_22;
+			c_31 = c_32;
+
+			c_02 = c_03;
+			c_12 = c_13;
+			c_22 = c_23;
+			c_32 = c_33;
+
+			D0 += 1*bs;
+			}
+		else if(n0==2)
+			{
+			c_00 = c_02;
+			c_10 = c_12;
+			c_20 = c_22;
+			c_30 = c_32;
+
+			c_01 = c_03;
+			c_11 = c_13;
+			c_21 = c_23;
+			c_31 = c_33;
+
+			D0 += 2*bs;
+			}
+		else //if(n0==3)
+			{
+			c_00 = c_03;
+			c_10 = c_13;
+			c_20 = c_23;
+			c_30 = c_33;
+
+			D0 += 3*bs;
+			}
+		}
+
+	int kn = n1 - n0;
+
+	if(offsetD==0)
+		{
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+		if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+		if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+		if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+		if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+		}
+	else if(offsetD==1)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+		}
+	else if(offsetD==2)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+		}
+	else //if(offsetD==3)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+		if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+		if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+		if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+		if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
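+// D[0:km,0:kn] = beta*C + alpha * A * B^T ("vs": variable-size store, writing
+// only the leading km x kn part of the 4x4 block; C and D are panel-aligned).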
+void kernel_dgemm_nt_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+	c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+	c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+	c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+	c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+	c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+	c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC)
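+// Full 4x4, panel-aligned case: thin wrapper around the variable-size kernel
+// with km=kn=4.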
+void kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	kernel_dgemm_nt_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
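+// D[0:4,0:4] = beta*C + alpha * A * B ("nn": neither operand transposed).
+// B starts at row offsetB (0..3) of its panel with panel stride sdb; the
+// peeled iterations at the top advance B to the next panel boundary before
+// the 4-way unrolled k loop. C/D offsets and the m0..m1 / n0..n1 store ranges
+// work as in the nt "gen" kernel above.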
+void kernel_dgemm_nn_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	double
+		*C1, *D1;
+	
+	int k;
+
+	k = 0;
+	if(offsetB!=0)
+		{
+		if(offsetB==1)
+			{
+
+			B += 1;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			k += 1;
+
+			if(k>=kmax)
+				goto scale;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			k += 1;
+
+			if(k>=kmax)
+				goto scale;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			B += bs*(sdb-1);
+			k += 1;
+
+			}
+		else if(offsetB==2)
+			{
+
+			B += 2;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			k += 1;
+
+			if(k>=kmax)
+				goto scale;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			B += bs*(sdb-1);
+			k += 1;
+
+			}
+		else // if(offsetB==3)
+			{
+
+			B += 3;
+
+			a_0 = A[0];
+			a_1 = A[1];
+			a_2 = A[2];
+			a_3 = A[3];
+
+			b_0 = B[0];
+			b_1 = B[4];
+			b_2 = B[8];
+			b_3 = B[12];
+
+			c_00 += a_0 * b_0;
+			c_10 += a_1 * b_0;
+			c_20 += a_2 * b_0;
+			c_30 += a_3 * b_0;
+
+			c_01 += a_0 * b_1;
+			c_11 += a_1 * b_1;
+			c_21 += a_2 * b_1;
+			c_31 += a_3 * b_1;
+
+			c_02 += a_0 * b_2;
+			c_12 += a_1 * b_2;
+			c_22 += a_2 * b_2;
+			c_32 += a_3 * b_2;
+
+			c_03 += a_0 * b_3;
+			c_13 += a_1 * b_3;
+			c_23 += a_2 * b_3;
+			c_33 += a_3 * b_3;
+
+			A += 4;
+			B += 1;
+			B += bs*(sdb-1);
+			k += 1;
+
+			}
+		}
+	for(; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[1];
+		b_1 = B[5];
+		b_2 = B[9];
+		b_3 = B[13];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[2];
+		b_1 = B[6];
+		b_2 = B[10];
+		b_3 = B[14];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[3];
+		b_1 = B[7];
+		b_2 = B[11];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 4*sdb;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+
+		}	
+	
+	scale:
+
+	if(offsetC==0)
+		{
+		c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[0+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[0+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[1+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[0+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[1+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C0[2+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==1)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[1+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[1+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[2+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[1+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[2+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C0[3+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==2)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[2+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[2+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[3+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[2+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[3+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C1[0+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+		}
+	else //if(offsetC==3)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[3+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[3+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C1[0+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[3+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C1[0+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C1[1+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+		}
+	
+	// shift the solution left so that columns 0..n0-1 of the block are skipped
+	if(n0>0)
+		{
+		if(n0==1)
+			{
+			c_00 = c_01;
+			c_10 = c_11;
+			c_20 = c_21;
+			c_30 = c_31;
+
+			c_01 = c_02;
+			c_11 = c_12;
+			c_21 = c_22;
+			c_31 = c_32;
+
+			c_02 = c_03;
+			c_12 = c_13;
+			c_22 = c_23;
+			c_32 = c_33;
+
+			D0 += 1*bs;
+			}
+		else if(n0==2)
+			{
+			c_00 = c_02;
+			c_10 = c_12;
+			c_20 = c_22;
+			c_30 = c_32;
+
+			c_01 = c_03;
+			c_11 = c_13;
+			c_21 = c_23;
+			c_31 = c_33;
+
+			D0 += 2*bs;
+			}
+		else //if(n0==3)
+			{
+			c_00 = c_03;
+			c_10 = c_13;
+			c_20 = c_23;
+			c_30 = c_33;
+
+			D0 += 3*bs;
+			}
+		}
+
+	int kn = n1 - n0;
+
+	if(offsetD==0)
+		{
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+		if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+		if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+		if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+		if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+		}
+	else if(offsetD==1)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+		}
+	else if(offsetD==2)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+		}
+	else //if(offsetD==3)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+		if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+		if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+		if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+		if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
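+// Panel-aligned case: wrapper around the "gen" kernel with offsetC=offsetD=0
+// and the full store range m0=n0=0, m1=n1=4; offsetB is passed through.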
+void kernel_dgemm_nn_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D)
+	{
+	kernel_dgemm_nn_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, beta, 0, C, 0, 0, D, 0, 0, 4, 0, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
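+// Lower-triangular 4x4 syrk update: D = beta*C + alpha * A * B^T, accumulating
+// and storing only the entries on and below the diagonal ("l"). Edge handling
+// (offsetC/offsetD, sdc/sdd, m0..m1, n0..n1) follows the dgemm "gen" kernels.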
+void kernel_dsyrk_nt_l_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0,
+		c_10=0, c_11=0,
+		c_20=0, c_21=0, c_22=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	double
+		*C1, *D1;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(offsetC==0)
+		{
+		c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+		c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+		c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+		c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==1)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+		c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+		c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+		c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==2)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+		c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+		c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+		c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+		}
+	else //if(offsetC==3)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+		c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+		c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+		c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+		}
+	
+	// shift the solution left so that columns 0..n0-1 of the block are skipped
+	if(n0>0)
+		{
+		if(n0==1)
+			{
+			c_10 = c_11;
+			c_20 = c_21;
+			c_30 = c_31;
+
+			c_21 = c_22;
+			c_31 = c_32;
+
+			c_32 = c_33;
+
+			D0 += 1*bs;
+			}
+		else if(n0==2)
+			{
+			c_20 = c_22;
+			c_30 = c_32;
+
+			c_31 = c_33;
+
+			D0 += 2*bs;
+			}
+		else //if(n0==3)
+			{
+			c_30 = c_33;
+
+			D0 += 3*bs;
+			}
+		}
+
+	int kn = n1 - n0;
+
+	if(offsetD==0)
+		{
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+		if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+		if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+		if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+		}
+	else if(offsetD==1)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+		}
+	else if(offsetD==2)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+		}
+	else //if(offsetD==3)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+		if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
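+// Variable-size lower-triangular syrk: only the on- and below-diagonal entries
+// of the leading km x kn part of the 4x4 block are written.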
+void kernel_dsyrk_nt_l_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0,
+		c_10=0, c_11=0,
+		c_20=0, c_21=0, c_22=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[2+bs*2] = c_22;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[1+bs*1] = c_11;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
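+// Full 4x4 lower-triangular syrk: wrapper with km=kn=4.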
+void kernel_dsyrk_nt_l_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	kernel_dsyrk_nt_l_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
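+// D[0:km,0:kn] = beta*C + alpha * A * B^T with B triangular ("ru"): the first
+// three k iterations use only 1, 2 and 3 entries of the corresponding B column,
+// matching the triangular fill-in; the remaining iterations are full rank-1
+// updates.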
+void kernel_dtrmm_nt_ru_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	k = 0;
+
+	// k = 0
+	if(kmax>0)
+		{
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 4;
+		k++;
+		}
+
+	// k = 1
+	if(kmax>1)
+		{
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 4;
+		k++;
+		}
+
+	// k = 2
+	if(kmax>2)
+		{
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 4;
+		k++;
+		}
+
+	for(; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+	c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+	c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+	c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+	c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+	c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+	c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
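+// Full 4x4 case: wrapper with km=kn=4.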
+void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	kernel_dtrmm_nt_ru_4x4_vs_lib4(k, alpha, A, B, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
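+// D = alpha * A * B with B triangular ("rl"); there is no C operand, so the
+// accumulator is only scaled by alpha before the store. The peeled iterations
+// at the top handle both the triangular part of B and an arbitrary starting
+// offsetB within its panel; edge stores use offsetD/sdd and the m0..m1 /
+// n0..n1 ranges as in the other "gen" kernels.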
+void kernel_dtrmm_nn_rl_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	double *D1;
+	
+	int k;
+
+	B += offsetB;
+
+	k = 0;
+
+	if(offsetB==0)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 3
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+	else if(offsetB==1)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+	else if(offsetB==2)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 3
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 4
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 5
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+	else // if(offsetB==3)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 3
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 4
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+
+	for(; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[1];
+		b_1 = B[5];
+		b_2 = B[9];
+		b_3 = B[13];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[2];
+		b_1 = B[6];
+		b_2 = B[10];
+		b_3 = B[14];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[3];
+		b_1 = B[7];
+		b_2 = B[11];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 4*sdb;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+
+		}
+	
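+	// store: scale the accumulators by alpha, discard the first n0 columns,
+	// then write the surviving part of the 4x4 block to D: rows are kept only
+	// for indices in [m0,m1), columns only for the first n1-n0, and offsetD
+	// shifts the rows within the panel, spilling into D1 = D0 + sdd*bs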
+	store:
+	
+	c_00 = alpha[0]*c_00;
+	c_10 = alpha[0]*c_10;
+	c_20 = alpha[0]*c_20;
+	c_30 = alpha[0]*c_30;
+
+	c_01 = alpha[0]*c_01;
+	c_11 = alpha[0]*c_11;
+	c_21 = alpha[0]*c_21;
+	c_31 = alpha[0]*c_31;
+
+	c_02 = alpha[0]*c_02;
+	c_12 = alpha[0]*c_12;
+	c_22 = alpha[0]*c_22;
+	c_32 = alpha[0]*c_32;
+
+	c_03 = alpha[0]*c_03;
+	c_13 = alpha[0]*c_13;
+	c_23 = alpha[0]*c_23;
+	c_33 = alpha[0]*c_33;
+
+	// shift the solution left by n0 columns (and advance D0 accordingly)
+	if(n0>0)
+		{
+		if(n0==1)
+			{
+			c_00 = c_01;
+			c_10 = c_11;
+			c_20 = c_21;
+			c_30 = c_31;
+
+			c_01 = c_02;
+			c_11 = c_12;
+			c_21 = c_22;
+			c_31 = c_32;
+
+			c_02 = c_03;
+			c_12 = c_13;
+			c_22 = c_23;
+			c_32 = c_33;
+
+			D0 += 1*bs;
+			}
+		else if(n0==2)
+			{
+			c_00 = c_02;
+			c_10 = c_12;
+			c_20 = c_22;
+			c_30 = c_32;
+
+			c_01 = c_03;
+			c_11 = c_13;
+			c_21 = c_23;
+			c_31 = c_33;
+
+			D0 += 2*bs;
+			}
+		else //if(n0==3)
+			{
+			c_00 = c_03;
+			c_10 = c_13;
+			c_20 = c_23;
+			c_30 = c_33;
+
+			D0 += 3*bs;
+			}
+		}
+
+	int kn = n1 - n0;
+
+	if(offsetD==0)
+		{
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+		if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+		if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+		if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+		if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+		}
+	else if(offsetD==1)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+		}
+	else if(offsetD==2)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+		}
+	else //if(offsetD==3)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+		if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+		if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+		if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+		if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+		}
+	
+	return;
+
+	}
+#endif
+
+
+
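+// full 4x4 dtrmm (nn, right-lower) kernel, implemented by calling the
+// generalized kernel above with offsetD = 0 and the full 0..4 row/column range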
+#if defined(TARGET_GENERIC)  || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmm_nn_rl_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *D)
+	{
+	kernel_dtrmm_nn_rl_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, 0, D, 0, 0, 4, 0, 4);
+	}
+#endif
+
+
+
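+// variable-size 4x4 Cholesky kernel (nt): computes the lower triangle of
+// C - A * B^T, factorizes it as L * L^T, stores L in the lower triangle of D
+// and 1.0/diag(L) in inv_diag_D; non-positive pivots are zeroed out, and
+// km/kn bound the rows/columns actually written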
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dpotrf_nt_l_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, //c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, //c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, //c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+//	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+//	c_02 = C[0+bs*2] + c_02;
+//	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+//	c_03 = C[0+bs*3] + c_03;
+//	c_13 = C[1+bs*3] + c_13;
+//	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+	if(c_00>0)
+		{
+		c_00 = sqrt(c_00);
+		tmp = 1.0/c_00;
+		}
+	else
+		{
+		c_00 = 0.0;
+		tmp = 0.0;
+		}
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+	inv_diag_D[0] = tmp;
+
+	if(kn==1)
+		goto store;
+	
+	c_11 -= c_10 * c_10;
+	c_21 -= c_20 * c_10;
+	c_31 -= c_30 * c_10;
+	if(c_11>0)
+		{
+		c_11 = sqrt(c_11);
+		tmp = 1.0/c_11;
+		}
+	else
+		{
+		c_11 = 0.0;
+		tmp = 0.0;
+		}
+	c_21 *= tmp;
+	c_31 *= tmp;
+	inv_diag_D[1] = tmp;
+
+	if(kn==2)
+		goto store;
+	
+	c_22 -= c_20 * c_20;
+	c_32 -= c_30 * c_20;
+	c_22 -= c_21 * c_21;
+	c_32 -= c_31 * c_21;
+	if(c_22>0)
+		{
+		c_22 = sqrt(c_22);
+		tmp = 1.0/c_22;
+		}
+	else
+		{
+		c_22 = 0.0;
+		tmp = 0.0;
+		}
+	c_32 *= tmp;
+	inv_diag_D[2] = tmp;
+
+	if(kn==3)
+		goto store;
+	
+	c_33 -= c_30 * c_30;
+	c_33 -= c_31 * c_31;
+	c_33 -= c_32 * c_32;
+	if(c_33>0)
+		{
+		c_33 = sqrt(c_33);
+		tmp = 1.0/c_33;
+		}
+	else
+		{
+		c_33 = 0.0;
+		tmp = 0.0;
+		}
+	inv_diag_D[3] = tmp;
+
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+//		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+//		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+//		if(kn==2)
+//			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+//		if(kn==1)
+//			return;
+
+//		D[0+bs*1] = c_01;
+
+//		if(kn==2)
+//			return;
+
+//		D[0+bs*2] = c_02;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dpotrf_nt_l_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D)
+	{
+	kernel_dpotrf_nt_l_4x4_vs_lib4(kmax, A, B, C, D, inv_diag_D, 4, 4);
+	}
+#endif
+
+
+
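+// fused syrk + potrf kernel: chains kernel_dsyrk_nt_l_4x4_vs_lib4 (with
+// alpha = beta = 1) and the Cholesky kernel above, so that D should end up
+// holding the lower factor of C + Ap * Bp^T - Am * Bm^T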
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn)
+	{
+	double alpha = 1.0;
+	double beta = 1.0;
+	kernel_dsyrk_nt_l_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
+	kernel_dpotrf_nt_l_4x4_vs_lib4(km_, Am, Bm, D, D, inv_diag_D, km, kn);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D)
+	{
+	double alpha = 1.0;
+	double beta = 1.0;
+	kernel_dsyrk_nt_l_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
+	kernel_dpotrf_nt_l_4x4_lib4(km_, Am, Bm, D, D, inv_diag_D);
+	}
+#endif
+
+
+
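+// variable-size 4x4 trsm kernel (nt, right-lower, non-unit diagonal):
+// computes C - A * B^T and solves D * E^T = C - A * B^T by forward
+// substitution over the columns, with E lower triangular and the
+// reciprocals of diag(E) precomputed in inv_diag_E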
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+	c_02 = C[0+bs*2] + c_02;
+	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+	c_03 = C[0+bs*3] + c_03;
+	c_13 = C[1+bs*3] + c_13;
+	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+	tmp = inv_diag_E[0];
+	c_00 *= tmp;
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+
+	if(kn==1)
+		goto store;
+	
+	tmp = E[1+bs*0];
+	c_01 -= c_00 * tmp;
+	c_11 -= c_10 * tmp;
+	c_21 -= c_20 * tmp;
+	c_31 -= c_30 * tmp;
+	tmp = inv_diag_E[1];
+	c_01 *= tmp;
+	c_11 *= tmp;
+	c_21 *= tmp;
+	c_31 *= tmp;
+
+	if(kn==2)
+		goto store;
+	
+	tmp = E[2+bs*0];
+	c_02 -= c_00 * tmp;
+	c_12 -= c_10 * tmp;
+	c_22 -= c_20 * tmp;
+	c_32 -= c_30 * tmp;
+	tmp = E[2+bs*1];
+	c_02 -= c_01 * tmp;
+	c_12 -= c_11 * tmp;
+	c_22 -= c_21 * tmp;
+	c_32 -= c_31 * tmp;
+	tmp = inv_diag_E[2];
+	c_02 *= tmp;
+	c_12 *= tmp;
+	c_22 *= tmp;
+	c_32 *= tmp;
+
+	if(kn==3)
+		goto store;
+	
+	tmp = E[3+bs*0];
+	c_03 -= c_00 * tmp;
+	c_13 -= c_10 * tmp;
+	c_23 -= c_20 * tmp;
+	c_33 -= c_30 * tmp;
+	tmp = E[3+bs*1];
+	c_03 -= c_01 * tmp;
+	c_13 -= c_11 * tmp;
+	c_23 -= c_21 * tmp;
+	c_33 -= c_31 * tmp;
+	tmp = E[3+bs*2];
+	c_03 -= c_02 * tmp;
+	c_13 -= c_12 * tmp;
+	c_23 -= c_22 * tmp;
+	c_33 -= c_32 * tmp;
+	tmp = inv_diag_E[3];
+	c_03 *= tmp;
+	c_13 *= tmp;
+	c_23 *= tmp;
+	c_33 *= tmp;
+
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
+	{
+	kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
+
+
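+// fused gemm + trsm kernel: chains kernel_dgemm_nt_4x4_vs_lib4 (with
+// alpha = beta = 1) and the trsm kernel above, so that D should end up
+// satisfying D * E^T = C + Ap * Bp^T - Am * Bm^T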
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+	{
+	double alpha = 1.0;
+	double beta  = 1.0;
+	kernel_dgemm_nt_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
+	kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(km_, Am, Bm, D, D, E, inv_diag_E, km, kn);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E)
+	{
+	double alpha = 1.0;
+	double beta  = 1.0;
+	kernel_dgemm_nt_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
+	kernel_dtrsm_nt_rl_inv_4x4_lib4(km_, Am, Bm, D, D, E, inv_diag_E);
+	}
+#endif
+
+
+
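+// variable-size 4x4 trsm kernel (nt, right-lower, unit diagonal): same as
+// the inv variant above, but E has unit diagonal, so no scaling by inverse
+// diagonal entries is performed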
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+	c_02 = C[0+bs*2] + c_02;
+	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+	c_03 = C[0+bs*3] + c_03;
+	c_13 = C[1+bs*3] + c_13;
+	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+	if(kn==1)
+		goto store;
+	
+	tmp = E[1+bs*0];
+	c_01 -= c_00 * tmp;
+	c_11 -= c_10 * tmp;
+	c_21 -= c_20 * tmp;
+	c_31 -= c_30 * tmp;
+
+	if(kn==2)
+		goto store;
+	
+	tmp = E[2+bs*0];
+	c_02 -= c_00 * tmp;
+	c_12 -= c_10 * tmp;
+	c_22 -= c_20 * tmp;
+	c_32 -= c_30 * tmp;
+	tmp = E[2+bs*1];
+	c_02 -= c_01 * tmp;
+	c_12 -= c_11 * tmp;
+	c_22 -= c_21 * tmp;
+	c_32 -= c_31 * tmp;
+
+	if(kn==3)
+		goto store;
+	
+	tmp = E[3+bs*0];
+	c_03 -= c_00 * tmp;
+	c_13 -= c_10 * tmp;
+	c_23 -= c_20 * tmp;
+	c_33 -= c_30 * tmp;
+	tmp = E[3+bs*1];
+	c_03 -= c_01 * tmp;
+	c_13 -= c_11 * tmp;
+	c_23 -= c_21 * tmp;
+	c_33 -= c_31 * tmp;
+	tmp = E[3+bs*2];
+	c_03 -= c_02 * tmp;
+	c_13 -= c_12 * tmp;
+	c_23 -= c_22 * tmp;
+	c_33 -= c_32 * tmp;
+
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E)
+	{
+	kernel_dtrsm_nt_rl_one_4x4_vs_lib4(k, A, B, C, D, E, 4, 4);
+	}
+#endif
+
+
+
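+// variable-size 4x4 trsm kernel (nt, right-upper, non-unit diagonal):
+// computes C - A * B^T and solves D * E^T = C - A * B^T by backward
+// substitution, last column first, with E upper triangular and its
+// diagonal reciprocals in inv_diag_E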
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	double
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+	c_02 = C[0+bs*2] + c_02;
+	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+	c_03 = C[0+bs*3] + c_03;
+	c_13 = C[1+bs*3] + c_13;
+	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+
+	if(kn>3)
+		{
+		tmp = inv_diag_E[3];
+		c_03 *= tmp;
+		c_13 *= tmp;
+		c_23 *= tmp;
+		c_33 *= tmp;
+		tmp = E[2+bs*3];
+		c_02 -= c_03 * tmp;
+		c_12 -= c_13 * tmp;
+		c_22 -= c_23 * tmp;
+		c_32 -= c_33 * tmp;
+		tmp = E[1+bs*3];
+		c_01 -= c_03 * tmp;
+		c_11 -= c_13 * tmp;
+		c_21 -= c_23 * tmp;
+		c_31 -= c_33 * tmp;
+		tmp = E[0+bs*3];
+		c_00 -= c_03 * tmp;
+		c_10 -= c_13 * tmp;
+		c_20 -= c_23 * tmp;
+		c_30 -= c_33 * tmp;
+		}
+
+	if(kn>2)
+		{
+		tmp = inv_diag_E[2];
+		c_02 *= tmp;
+		c_12 *= tmp;
+		c_22 *= tmp;
+		c_32 *= tmp;
+		tmp = E[1+bs*2];
+		c_01 -= c_02 * tmp;
+		c_11 -= c_12 * tmp;
+		c_21 -= c_22 * tmp;
+		c_31 -= c_32 * tmp;
+		tmp = E[0+bs*2];
+		c_00 -= c_02 * tmp;
+		c_10 -= c_12 * tmp;
+		c_20 -= c_22 * tmp;
+		c_30 -= c_32 * tmp;
+		}
+
+	if(kn>1)
+		{
+		tmp = inv_diag_E[1];
+		c_01 *= tmp;
+		c_11 *= tmp;
+		c_21 *= tmp;
+		c_31 *= tmp;
+		tmp = E[0+bs*1];
+		c_00 -= c_01 * tmp;
+		c_10 -= c_11 * tmp;
+		c_20 -= c_21 * tmp;
+		c_30 -= c_31 * tmp;
+		}
+
+	tmp = inv_diag_E[0];
+	c_00 *= tmp;
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
+	{
+	kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
+
+
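+// variable-size 4x4 LU kernel (nn, no pivoting): computes C - A * B (B in
+// panel-major storage with panel stride sdb) and factorizes the block as
+// L * U with unit-diagonal L; the multipliers overwrite the strictly lower
+// part, U the rest, and inv_diag_D receives 1.0/diag(U)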
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgetrf_nn_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	double
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	if(kmax<=0)
+		goto add;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+
+	// factorization
+
+	// first column
+	tmp = 1.0 / c_00;
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+
+	inv_diag_D[0] = tmp;
+
+	if(kn==1)
+		goto store;
+
+	// second column
+	c_11 -= c_10 * c_01;
+	c_21 -= c_20 * c_01;
+	c_31 -= c_30 * c_01;
+
+	tmp = 1.0 / c_11;
+	c_21 *= tmp;
+	c_31 *= tmp;
+	
+	inv_diag_D[1] = tmp;
+
+	if(kn==2)
+		goto store;
+
+	// third column
+	c_12 -= c_10 * c_02;
+	c_22 -= c_20 * c_02;
+	c_32 -= c_30 * c_02;
+
+	c_22 -= c_21 * c_12;
+	c_32 -= c_31 * c_12;
+
+	tmp = 1.0 / c_22;
+	c_32 *= tmp;
+
+	inv_diag_D[2] = tmp;
+
+	if(kn==3)
+		goto store;
+
+	// fourth column
+	c_13 -= c_10 * c_03;
+	c_23 -= c_20 * c_03;
+	c_33 -= c_30 * c_03;
+
+	c_23 -= c_21 * c_13;
+	c_33 -= c_31 * c_13;
+
+	c_33 -= c_32 * c_23;
+
+	tmp = 1.0 / c_33;
+
+	inv_diag_D[3] = tmp;
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgetrf_nn_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D)
+	{
+	kernel_dgetrf_nn_4x4_vs_lib4(kmax, A, B, sdb, C, D, inv_diag_D, 4, 4);
+	}
+#endif
+
+
+
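+// variable-size 4x4 trsm kernel (nn, left-lower, unit diagonal): computes
+// C - A * B and solves E * D = C - A * B by forward substitution over the
+// rows; E is unit lower triangular, so no diagonal scaling is needed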
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	double
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_1, e_2, e_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	if(kmax<=0)
+		goto add;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+
+	// solution
+
+	if(km==1)
+		goto store;
+	
+	e_1 = E[1+bs*0];
+	e_2 = E[2+bs*0];
+	e_3 = E[3+bs*0];
+	c_10 -= e_1 * c_00;
+	c_20 -= e_2 * c_00;
+	c_30 -= e_3 * c_00;
+	c_11 -= e_1 * c_01;
+	c_21 -= e_2 * c_01;
+	c_31 -= e_3 * c_01;
+	c_12 -= e_1 * c_02;
+	c_22 -= e_2 * c_02;
+	c_32 -= e_3 * c_02;
+	c_13 -= e_1 * c_03;
+	c_23 -= e_2 * c_03;
+	c_33 -= e_3 * c_03;
+
+	if(km==2)
+		goto store;
+	
+	e_2 = E[2+bs*1];
+	e_3 = E[3+bs*1];
+	c_20 -= e_2 * c_10;
+	c_30 -= e_3 * c_10;
+	c_21 -= e_2 * c_11;
+	c_31 -= e_3 * c_11;
+	c_22 -= e_2 * c_12;
+	c_32 -= e_3 * c_12;
+	c_23 -= e_2 * c_13;
+	c_33 -= e_3 * c_13;
+
+	if(km==3)
+		goto store;
+	
+	e_3 = E[3+bs*2];
+	c_30 -= e_3 * c_20;
+	c_31 -= e_3 * c_21;
+	c_32 -= e_3 * c_22;
+	c_33 -= e_3 * c_23;
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_ll_one_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E)
+	{
+	kernel_dtrsm_nn_ll_one_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, 4, 4);
+	}
+#endif
+
+
+
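+// variable-size 4x4 trsm kernel (nn, right-upper, non-unit diagonal):
+// computes C - A * B and solves D * E = C - A * B column by column, with E
+// upper triangular and the reciprocals of its diagonal in inv_diag_E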
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	double
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_00, e_01, e_02, e_03,
+		      e_11, e_12, e_13,
+			        e_22, e_23,
+					      e_33,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	if(kmax<=0)
+		goto add;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+	
+	// solve
+
+	e_00 = inv_diag_E[0];
+	c_00 *= e_00;
+	c_10 *= e_00;
+	c_20 *= e_00;
+	c_30 *= e_00;
+
+	if(kn==1)
+		goto store;
+	
+	e_01 = E[0+bs*1];
+	e_11 = inv_diag_E[1];
+	c_01 -= c_00 * e_01;
+	c_11 -= c_10 * e_01;
+	c_21 -= c_20 * e_01;
+	c_31 -= c_30 * e_01;
+	c_01 *= e_11;
+	c_11 *= e_11;
+	c_21 *= e_11;
+	c_31 *= e_11;
+
+	if(kn==2)
+		goto store;
+	
+	e_02 = E[0+bs*2];
+	e_12 = E[1+bs*2];
+	e_22 = inv_diag_E[2];
+	c_02 -= c_00 * e_02;
+	c_12 -= c_10 * e_02;
+	c_22 -= c_20 * e_02;
+	c_32 -= c_30 * e_02;
+	c_02 -= c_01 * e_12;
+	c_12 -= c_11 * e_12;
+	c_22 -= c_21 * e_12;
+	c_32 -= c_31 * e_12;
+	c_02 *= e_22;
+	c_12 *= e_22;
+	c_22 *= e_22;
+	c_32 *= e_22;
+
+	if(kn==3)
+		goto store;
+	
+	e_03 = E[0+bs*3];
+	e_13 = E[1+bs*3];
+	e_23 = E[2+bs*3];
+	e_33 = inv_diag_E[3];
+	c_03 -= c_00 * e_03;
+	c_13 -= c_10 * e_03;
+	c_23 -= c_20 * e_03;
+	c_33 -= c_30 * e_03;
+	c_03 -= c_01 * e_13;
+	c_13 -= c_11 * e_13;
+	c_23 -= c_21 * e_13;
+	c_33 -= c_31 * e_13;
+	c_03 -= c_02 * e_23;
+	c_13 -= c_12 * e_23;
+	c_23 -= c_22 * e_23;
+	c_33 -= c_32 * e_23;
+	c_03 *= e_33;
+	c_13 *= e_33;
+	c_23 *= e_33;
+	c_33 *= e_33;
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_ru_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
+	{
+	kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
+
+
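+// variable-size 4x4 trsm kernel (nn, left-upper, non-unit diagonal):
+// computes C - A * B and solves E * D = C - A * B by backward substitution
+// over the rows, last row first, with E upper triangular and 1.0/diag(E)
+// in inv_diag_E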
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	double
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_00, e_01, e_02, e_03,
+		      e_11, e_12, e_13,
+			        e_22, e_23,
+					      e_33,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	if(kmax<=0)
+		goto add;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+
+//	printf("\n%f %f %f %f\n", c_00, c_01, c_02, c_03);
+//	printf("\n%f %f %f %f\n", c_10, c_11, c_12, c_13);
+//	printf("\n%f %f %f %f\n", c_20, c_21, c_22, c_23);
+//	printf("\n%f %f %f %f\n", c_30, c_31, c_32, c_33);
+	
+	// solve
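+	// backward substitution with the upper triangular factor E: each active row r
+	// (r = 3..1, skipped if r >= km) is scaled by inv_diag_E[r] and then eliminated
+	// from the rows above it; row 0 is scaled last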
+
+	if(km>3)
+		{
+		e_03 = E[0+bs*3];
+		e_13 = E[1+bs*3];
+		e_23 = E[2+bs*3];
+		e_33 = inv_diag_E[3];
+		c_30 *= e_33;
+		c_31 *= e_33;
+		c_32 *= e_33;
+		c_33 *= e_33;
+		c_00 -= e_03 * c_30;
+		c_01 -= e_03 * c_31;
+		c_02 -= e_03 * c_32;
+		c_03 -= e_03 * c_33;
+		c_10 -= e_13 * c_30;
+		c_11 -= e_13 * c_31;
+		c_12 -= e_13 * c_32;
+		c_13 -= e_13 * c_33;
+		c_20 -= e_23 * c_30;
+		c_21 -= e_23 * c_31;
+		c_22 -= e_23 * c_32;
+		c_23 -= e_23 * c_33;
+		}
+	
+	if(km>2)
+		{
+		e_02 = E[0+bs*2];
+		e_12 = E[1+bs*2];
+		e_22 = inv_diag_E[2];
+		c_20 *= e_22;
+		c_21 *= e_22;
+		c_22 *= e_22;
+		c_23 *= e_22;
+		c_00 -= e_02 * c_20;
+		c_01 -= e_02 * c_21;
+		c_02 -= e_02 * c_22;
+		c_03 -= e_02 * c_23;
+		c_10 -= e_12 * c_20;
+		c_11 -= e_12 * c_21;
+		c_12 -= e_12 * c_22;
+		c_13 -= e_12 * c_23;
+		}
+	
+	if(km>1)
+		{
+		e_01 = E[0+bs*1];
+		e_11 = inv_diag_E[1];
+		c_10 *= e_11;
+		c_11 *= e_11;
+		c_12 *= e_11;
+		c_13 *= e_11;
+		c_00 -= e_01 * c_10;
+		c_01 -= e_01 * c_11;
+		c_02 -= e_01 * c_12;
+		c_03 -= e_01 * c_13;
+		}
+	
+	e_00 = inv_diag_E[0];
+	c_00 *= e_00;
+	c_01 *= e_00;
+	c_02 *= e_00;
+	c_03 *= e_00;
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_lu_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
+	{
+	kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
diff --git a/kernel/c99/kernel_dgemm_diag_lib4.c b/kernel/c99/kernel_dgemm_diag_lib4.c
new file mode 100644
index 0000000..cad2b21
--- /dev/null
+++ b/kernel/c99/kernel_dgemm_diag_lib4.c
@@ -0,0 +1,1111 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+// B is the diagonal of a matrix, case beta=0.0
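+// computes D = alpha * A * diag(B[0:4]) on a kmax x 4 panel (sda, sdd are panel strides)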
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_4_a0_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_0, c_1, c_2, c_3;
+	
+	alpha0 = alpha[0];
+		
+	b_0 = alpha0 * B[0];
+	b_1 = alpha0 * B[1];
+	b_2 = alpha0 * B[2];
+	b_3 = alpha0 * B[3];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_0;
+		c_2 = a_2 * b_0;
+		c_3 = a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		c_0 = a_0 * b_1;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_1;
+		c_3 = a_3 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		c_0 = a_0 * b_2;
+		c_1 = a_1 * b_2;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_2;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		c_0 = a_0 * b_3;
+		c_1 = a_1 * b_3;
+		c_2 = a_2 * b_3;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+		D[3+bs*3] = c_3;
+
+		A += 4*sda;
+		D += 4*sdd;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		a_0 = A[0+bs*1];
+		
+		c_0 = a_0 * b_1;
+
+		D[0+bs*1] = c_0;
+		
+
+		a_0 = A[0+bs*2];
+		
+		c_0 = a_0 * b_2;
+
+		D[0+bs*2] = c_0;
+		
+
+		a_0 = A[0+bs*3];
+		
+		c_0 = a_0 * b_3;
+
+		D[0+bs*3] = c_0;
+
+
+		A += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
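+// computes D = beta * C + alpha * A * diag(B[0:4]) on a kmax x 4 panel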
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_0, c_1, c_2, c_3;
+	
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	b_0 = alpha0 * B[0];
+	b_1 = alpha0 * B[1];
+	b_2 = alpha0 * B[2];
+	b_3 = alpha0 * B[3];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+		c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_2;
+		c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*2] + a_3 * b_2;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_3;
+		c_1 = beta0 * C[1+bs*3] + a_1 * b_3;
+		c_2 = beta0 * C[2+bs*3] + a_2 * b_3;
+		c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+		D[3+bs*3] = c_3;
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		a_0 = A[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+		D[0+bs*1] = c_0;
+		
+
+		a_0 = A[0+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+
+		D[0+bs*2] = c_0;
+		
+
+		a_0 = A[0+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_3;
+
+		D[0+bs*3] = c_0;
+
+
+		A += 1;
+		C += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
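+// 3-column variant: D = beta * C + alpha * A * diag(B[0:3]) on a kmax x 3 panel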
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_3_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	b_0 = alpha0 * B[0];
+	b_1 = alpha0 * B[1];
+	b_2 = alpha0 * B[2];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+		c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_2;
+		c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*2] + a_3 * b_2;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		a_0 = A[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+		D[0+bs*1] = c_0;
+		
+
+		a_0 = A[0+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+
+		D[0+bs*2] = c_0;
+		
+
+		A += 1;
+		C += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
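+// 2-column variant: D = beta * C + alpha * A * diag(B[0:2]) on a kmax x 2 panel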
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_2_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	b_0 = alpha0 * B[0];
+	b_1 = alpha0 * B[1];
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+		c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		a_0 = A[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+		D[0+bs*1] = c_0;
+		
+
+		A += 1;
+		C += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
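+// 1-column variant: D = beta * C + alpha * B[0] * A on a kmax x 1 panel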
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_1_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	b_0 = alpha0 * B[0];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		A += 1;
+		C += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix, case beta=0.0
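+// computes D = alpha * diag(A[0:4]) * B on a 4 x kmax panel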
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_4_a0_lib4(int kmax, double *alpha, double *A, double *B, double *D, int alg)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+		
+	a_0 = alpha0 * A[0];
+	a_1 = alpha0 * A[1];
+	a_2 = alpha0 * A[2];
+	a_3 = alpha0 * A[3];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		b_3 = B[3+bs*0];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		b_0 = B[0+bs*1];
+		b_1 = B[1+bs*1];
+		b_2 = B[2+bs*1];
+		b_3 = B[3+bs*1];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		b_0 = B[0+bs*2];
+		b_1 = B[1+bs*2];
+		b_2 = B[2+bs*2];
+		b_3 = B[3+bs*2];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		b_0 = B[0+bs*3];
+		b_1 = B[1+bs*3];
+		b_2 = B[2+bs*3];
+		b_3 = B[3+bs*3];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+		D[3+bs*3] = c_3;
+
+		B += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		b_3 = B[3+bs*0];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+	
+		B += 4;
+		D += 4;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix
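+// computes D = beta * C + alpha * diag(A[0:4]) * B on a 4 x kmax panel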
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int alg)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = alpha0 * A[0];
+	a_1 = alpha0 * A[1];
+	a_2 = alpha0 * A[2];
+	a_3 = alpha0 * A[3];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		b_3 = B[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		b_0 = B[0+bs*1];
+		b_1 = B[1+bs*1];
+		b_2 = B[2+bs*1];
+		b_3 = B[3+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*1] + a_3 * b_3;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		b_0 = B[0+bs*2];
+		b_1 = B[1+bs*2];
+		b_2 = B[2+bs*2];
+		b_3 = B[3+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*2] + a_3 * b_3;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		b_0 = B[0+bs*3];
+		b_1 = B[1+bs*3];
+		b_2 = B[2+bs*3];
+		b_3 = B[3+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+		D[3+bs*3] = c_3;
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		b_3 = B[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix
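+// 3-row variant: D = beta * C + alpha * diag(A[0:3]) * B on a 3 x kmax panel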
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_3_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0, a_1, a_2,
+		b_0, b_1, b_2,
+		c_0, c_1, c_2;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = alpha0 * A[0];
+	a_1 = alpha0 * A[1];
+	a_2 = alpha0 * A[2];
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		
+
+		b_0 = B[0+bs*1];
+		b_1 = B[1+bs*1];
+		b_2 = B[2+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		
+
+		b_0 = B[0+bs*2];
+		b_1 = B[1+bs*2];
+		b_2 = B[2+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		
+
+		b_0 = B[0+bs*3];
+		b_1 = B[1+bs*3];
+		b_2 = B[2+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix
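+// 2-row variant: D = beta * C + alpha * diag(A[0:2]) * B on a 2 x kmax panel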
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_2_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0, a_1,
+		b_0, b_1,
+		c_0, c_1;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = alpha0 * A[0];
+	a_1 = alpha0 * A[1];
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		
+
+		b_0 = B[0+bs*1];
+		b_1 = B[1+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		
+
+		b_0 = B[0+bs*2];
+		b_1 = B[1+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		
+
+		b_0 = B[0+bs*3];
+		b_1 = B[1+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix
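+// 1-row variant: D = beta * C + alpha * A[0] * B on a 1 x kmax panel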
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_1_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		alpha0, beta0,
+		a_0,
+		b_0,
+		c_0;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = alpha0 * A[0];
+		
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		b_0 = B[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+
+		D[0+bs*1] = c_0;
+		
+
+		b_0 = B[0+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+
+		D[0+bs*2] = c_0;
+		
+
+		b_0 = B[0+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+
+		D[0+bs*3] = c_0;
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+		
+	}
+#endif
+
+
diff --git a/kernel/c99/kernel_dgemv_4_lib4.c b/kernel/c99/kernel_dgemv_4_lib4.c
new file mode 100644
index 0000000..9f11b5f
--- /dev/null
+++ b/kernel/c99/kernel_dgemv_4_lib4.c
@@ -0,0 +1,1009 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
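+// computes z[0:4] = beta * y[0:4] + alpha * A * x for a 4-row panel of A with kmax columns;
+// only the entries with index i such that k0 <= i < k1 are stored to z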
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_n_4_gen_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k0, int k1)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	double
+		x_0,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	k=0;
+	for(; k<kmax-3; k+=4)
+		{
+
+		x_0 = x[0];
+
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[1+bs*0] * x_0;
+		y_2 += A[2+bs*0] * x_0;
+		y_3 += A[3+bs*0] * x_0;
+		
+		x_0 = x[1];
+
+		y_0 += A[0+bs*1] * x_0;
+		y_1 += A[1+bs*1] * x_0;
+		y_2 += A[2+bs*1] * x_0;
+		y_3 += A[3+bs*1] * x_0;
+		
+		x_0 = x[2];
+
+		y_0 += A[0+bs*2] * x_0;
+		y_1 += A[1+bs*2] * x_0;
+		y_2 += A[2+bs*2] * x_0;
+		y_3 += A[3+bs*2] * x_0;
+		
+		x_0 = x[3];
+
+		y_0 += A[0+bs*3] * x_0;
+		y_1 += A[1+bs*3] * x_0;
+		y_2 += A[2+bs*3] * x_0;
+		y_3 += A[3+bs*3] * x_0;
+		
+		A += 4*bs;
+		x += 4;
+
+		}
+
+	for(; k<kmax; k++)
+		{
+
+		x_0 = x[0];
+
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[1+bs*0] * x_0;
+		y_2 += A[2+bs*0] * x_0;
+		y_3 += A[3+bs*0] * x_0;
+		
+		A += 1*bs;
+		x += 1;
+
+		}
+
+	y_0 = alpha[0]*y_0 + beta[0]*y[0];
+	y_1 = alpha[0]*y_1 + beta[0]*y[1];
+	y_2 = alpha[0]*y_2 + beta[0]*y[2];
+	y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+	if(k0<=0 & k1>3)
+		{
+		z[0] = y_0;
+		z[1] = y_1;
+		z[2] = y_2;
+		z[3] = y_3;
+		}
+	else
+		{
+		if(k0<=0 & k1>0) z[0] = y_0;
+		if(k0<=1 & k1>1) z[1] = y_1;
+		if(k0<=2 & k1>2) z[2] = y_2;
+		if(k0<=3 & k1>3) z[3] = y_3;
+		}
+
+	}
+#endif
+	
+	
+	
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_n_4_vs_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k1)
+	{
+
+	kernel_dgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, k1);
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_n_4_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z)
+	{
+
+	kernel_dgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, 4);
+
+	}
+#endif
+
+
+
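+// computes z[0:4] = beta * y[0:4] + alpha * A' * x for a 4-column panel of A with kmax rows;
+// offA is the row offset of A inside its first panel, km limits how many results are stored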
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_t_4_gen_lib4(int kmax, double *alpha, int offA, double *A, int sda, double *x, double *beta, double *y, double *z, int km)
+	{
+
+	const int bs  = 4;
+	
+	int k, kend;
+	
+	double
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	k=0;
+	if(offA!=0) // 1, 2, 3
+		{
+		kend = 4-offA<kmax ? 4-offA : kmax;
+		for(; k<kend; k++)
+			{
+			
+			x_0 = x[0];
+		
+			y_0 += A[0+bs*0] * x_0;
+			y_1 += A[0+bs*1] * x_0;
+			y_2 += A[0+bs*2] * x_0;
+			y_3 += A[0+bs*3] * x_0;
+		
+			A += 1;
+			x += 1;
+			
+			}
+		A += bs*(sda-1);
+		}
+	for(; k<kmax-bs+1; k+=bs)
+		{
+		
+		x_0 = x[0];
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+		
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[0+bs*1] * x_0;
+		y_2 += A[0+bs*2] * x_0;
+		y_3 += A[0+bs*3] * x_0;
+
+		y_0 += A[1+bs*0] * x_1;
+		y_1 += A[1+bs*1] * x_1;
+		y_2 += A[1+bs*2] * x_1;
+		y_3 += A[1+bs*3] * x_1;
+		
+		y_0 += A[2+bs*0] * x_2;
+		y_1 += A[2+bs*1] * x_2;
+		y_2 += A[2+bs*2] * x_2;
+		y_3 += A[2+bs*3] * x_2;
+
+		y_0 += A[3+bs*0] * x_3;
+		y_1 += A[3+bs*1] * x_3;
+		y_2 += A[3+bs*2] * x_3;
+		y_3 += A[3+bs*3] * x_3;
+		
+		A += sda*bs;
+		x += 4;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		x_0 = x[0];
+	
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[0+bs*1] * x_0;
+		y_2 += A[0+bs*2] * x_0;
+		y_3 += A[0+bs*3] * x_0;
+	
+		A += 1;
+		x += 1;
+		
+		}
+
+	y_0 = alpha[0]*y_0 + beta[0]*y[0];
+	y_1 = alpha[0]*y_1 + beta[0]*y[1];
+	y_2 = alpha[0]*y_2 + beta[0]*y[2];
+	y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+	if(km>=4)
+		{
+		z[0] = y_0;
+		z[1] = y_1;
+		z[2] = y_2;
+		z[3] = y_3;
+		}
+	else
+		{
+		z[0] = y_0;
+		if(km>=2)
+			{
+			z[1] = y_1;
+			if(km>2)
+				{
+				z[2] = y_2;
+				}
+			}
+		}
+
+	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_t_4_lib4(int kmax, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z)
+	{
+
+	kernel_dgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, 4);
+
+	}
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_t_4_vs_lib4(int kmax, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z, int k1)
+	{
+
+	kernel_dgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, k1);
+
+	}
+#endif
+
+
+
+
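+// forward substitution with a 4x4 lower triangular block (diagonal stored pre-inverted in
+// inv_diag_A), after updating the right-hand side with the kmax columns of A preceding it;
+// km and kn limit how many entries are computed and stored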
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsv_ln_inv_4_vs_lib4(int kmax, double *A, double *inv_diag_A, double *x, double *y, double *z, int km, int kn)
+	{
+
+	const int bs = 4;
+	
+	int k;
+
+	double
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	k=0;
+	for(; k<kmax-3; k+=4)
+		{
+
+		x_0 = x[0];
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+
+		y_0 -= A[0+bs*0] * x_0;
+		y_1 -= A[1+bs*0] * x_0;
+		y_2 -= A[2+bs*0] * x_0;
+		y_3 -= A[3+bs*0] * x_0;
+
+		y_0 -= A[0+bs*1] * x_1;
+		y_1 -= A[1+bs*1] * x_1;
+		y_2 -= A[2+bs*1] * x_1;
+		y_3 -= A[3+bs*1] * x_1;
+
+		y_0 -= A[0+bs*2] * x_2;
+		y_1 -= A[1+bs*2] * x_2;
+		y_2 -= A[2+bs*2] * x_2;
+		y_3 -= A[3+bs*2] * x_2;
+
+		y_0 -= A[0+bs*3] * x_3;
+		y_1 -= A[1+bs*3] * x_3;
+		y_2 -= A[2+bs*3] * x_3;
+		y_3 -= A[3+bs*3] * x_3;
+		
+		A += 4*bs;
+		x += 4;
+
+		}
+
+	y_0 = y[0] + y_0;
+	y_1 = y[1] + y_1;
+	y_2 = y[2] + y_2;
+	y_3 = y[3] + y_3;
+
+	double
+		a_00, a_10, a_20, a_30,
+		a_11, a_21, a_31;
+	
+	// a_00
+	a_00 = inv_diag_A[0];
+	a_10 = A[1+bs*0];
+	a_20 = A[2+bs*0];
+	a_30 = A[3+bs*0];
+	y_0 *= a_00;
+	z[0] = y_0;
+	y_1 -= a_10 * y_0;
+	y_2 -= a_20 * y_0;
+	y_3 -= a_30 * y_0;
+
+	if(kn==1)
+		{
+		if(km==1)
+			return;
+		y[1] = y_1;
+		if(km==2)
+			return;
+		y[2] = y_2;
+		if(km==3)
+			return;
+		y[3] = y_3;
+		return;
+		}
+
+	// a_11
+	a_11 = inv_diag_A[1];
+	a_21 = A[2+bs*1];
+	a_31 = A[3+bs*1];
+	y_1 *= a_11;	
+	z[1] = y_1;
+	y_2 -= a_21 * y_1;
+	y_3 -= a_31 * y_1;
+
+	if(kn==2)
+		{
+		if(km==2)
+			return;
+		y[2] = y_2;
+		if(km==3)
+			return;
+		y[3] = y_3;
+		return;
+		}
+
+	// a_22
+	a_00 = inv_diag_A[2];
+	a_10 = A[3+bs*2];
+	y_2 *= a_00;
+	z[2] = y_2;
+	y_3 -= a_10 * y_2;
+
+	if(kn==3)
+		{
+		if(km==3)
+			return;
+		y[3] = y_3;
+
+		return;
+		}
+
+	// a_33
+	a_11 = inv_diag_A[3];
+	y_3 *= a_11;	
+	z[3] = y_3;
+
+	}
+#endif
+	
+
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsv_ln_inv_4_lib4(int kmax, double *A, double *inv_diag_A, double *x, double *y, double *z)
+	{
+
+	kernel_dtrsv_ln_inv_4_vs_lib4(kmax, A, inv_diag_A, x, y, z, 4, 4);
+
+
+	}
+#endif
+	
+	
+		
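+// backward substitution with the transpose of a 4x4 lower triangular block (diagonal
+// pre-inverted in inv_diag_A), after updating the right-hand side with the rows of A
+// below that block (columns 0..3, rows 4..kmax-1)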
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsv_lt_inv_4_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
+	{
+
+	const int bs = 4;
+	
+	int
+		k;
+	
+	double *tA, *tx;
+	tA = A;
+	tx = x;
+
+	double
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	k=4;
+	A += 4 + (sda-1)*bs;
+	x += 4;
+	for(; k<kmax-3; k+=4)
+		{
+		
+		x_0 = x[0];
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+		
+		y_0 -= A[0+bs*0] * x_0;
+		y_1 -= A[0+bs*1] * x_0;
+		y_2 -= A[0+bs*2] * x_0;
+		y_3 -= A[0+bs*3] * x_0;
+
+		y_0 -= A[1+bs*0] * x_1;
+		y_1 -= A[1+bs*1] * x_1;
+		y_2 -= A[1+bs*2] * x_1;
+		y_3 -= A[1+bs*3] * x_1;
+		
+		y_0 -= A[2+bs*0] * x_2;
+		y_1 -= A[2+bs*1] * x_2;
+		y_2 -= A[2+bs*2] * x_2;
+		y_3 -= A[2+bs*3] * x_2;
+
+		y_0 -= A[3+bs*0] * x_3;
+		y_1 -= A[3+bs*1] * x_3;
+		y_2 -= A[3+bs*2] * x_3;
+		y_3 -= A[3+bs*3] * x_3;
+		
+		A += sda*bs;
+		x += 4;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		x_0 = x[0];
+		
+		y_0 -= A[0+bs*0] * x_0;
+		y_1 -= A[0+bs*1] * x_0;
+		y_2 -= A[0+bs*2] * x_0;
+		y_3 -= A[0+bs*3] * x_0;
+		
+		A += 1;//sda*bs;
+		x += 1;
+
+		}
+	
+	y_0 = y[0] + y_0;
+	y_1 = y[1] + y_1;
+	y_2 = y[2] + y_2;
+	y_3 = y[3] + y_3;
+
+	A = tA;
+	x = tx;
+
+	// bottom triangle
+	y_3 *= inv_diag_A[3];
+	z[3] = y_3;
+
+	y_2 -= A[3+bs*2] * y_3;
+	y_2 *= inv_diag_A[2];
+	z[2] = y_2;
+
+	// square
+	y_0 -= A[2+bs*0]*y_2 + A[3+bs*0]*y_3;
+	y_1 -= A[2+bs*1]*y_2 + A[3+bs*1]*y_3;
+		
+	// top triangle
+	y_1 *= inv_diag_A[1];
+	z[1] = y_1;
+
+	y_0 -= A[1+bs*0] * y_1;
+	y_0 *= inv_diag_A[0];
+	z[0] = y_0;
+
+	}
+#endif
+	
+	
+	
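+// variant of kernel_dtrsv_lt_inv_4_lib4 solving only for the first 3 unknowns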
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsv_lt_inv_3_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
+	{
+
+	const int bs = 4;
+	
+	int
+		k;
+	
+	double *tA, *tx;
+	tA = A;
+	tx = x;
+
+	double
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0;
+	
+	k = 3;
+	if(kmax>4)
+		{
+		// clean up at the beginning
+		x_3 = x[3];
+
+		y_0 -= A[3+bs*0] * x_3;
+		y_1 -= A[3+bs*1] * x_3;
+		y_2 -= A[3+bs*2] * x_3;
+
+		k=4;
+		A += 4 + (sda-1)*bs;
+		x += 4;
+		for(; k<kmax-3; k+=4)
+			{
+			
+			x_0 = x[0];
+			x_1 = x[1];
+			x_2 = x[2];
+			x_3 = x[3];
+			
+			y_0 -= A[0+bs*0] * x_0;
+			y_1 -= A[0+bs*1] * x_0;
+			y_2 -= A[0+bs*2] * x_0;
+
+			y_0 -= A[1+bs*0] * x_1;
+			y_1 -= A[1+bs*1] * x_1;
+			y_2 -= A[1+bs*2] * x_1;
+			
+			y_0 -= A[2+bs*0] * x_2;
+			y_1 -= A[2+bs*1] * x_2;
+			y_2 -= A[2+bs*2] * x_2;
+
+			y_0 -= A[3+bs*0] * x_3;
+			y_1 -= A[3+bs*1] * x_3;
+			y_2 -= A[3+bs*2] * x_3;
+			
+			A += sda*bs;
+			x += 4;
+
+			}
+		}
+	else
+		{
+		A += 3;
+		x += 1;
+		}
+	for(; k<kmax; k++)
+		{
+		
+		x_0 = x[0];
+		
+		y_0 -= A[0+bs*0] * x_0;
+		y_1 -= A[0+bs*1] * x_0;
+		y_2 -= A[0+bs*2] * x_0;
+		
+		A += 1;//sda*bs;
+		x += 1;
+
+		}
+
+	y_0 = y[0] + y_0;
+	y_1 = y[1] + y_1;
+	y_2 = y[2] + y_2;
+
+	A = tA;
+	x = tx;
+
+	// bottom triangle
+	y_2 *= inv_diag_A[2];
+	z[2] = y_2;
+
+	// square
+	y_0 -= A[2+bs*0]*y_2;
+	y_1 -= A[2+bs*1]*y_2;
+		
+	// top triangle
+	y_1 *= inv_diag_A[1];
+	z[1] = y_1;
+
+	y_0 -= A[1+bs*0] * y_1;
+	y_0 *= inv_diag_A[0];
+	z[0] = y_0;
+
+	}
+#endif
+	
+	
+	
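+// variant of kernel_dtrsv_lt_inv_4_lib4 solving only for the first 2 unknowns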
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsv_lt_inv_2_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
+	{
+
+	const int bs = 4;
+	
+	int
+		k;
+	
+	double *tA, *tx;
+	tA = A;
+	tx = x;
+
+	double
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0;
+	
+	k = 2;
+	if(kmax>4)
+		{
+		// clean up at the beginning
+		x_2 = x[2];
+		x_3 = x[3];
+
+		y_0 -= A[2+bs*0] * x_2;
+		y_1 -= A[2+bs*1] * x_2;
+
+		y_0 -= A[3+bs*0] * x_3;
+		y_1 -= A[3+bs*1] * x_3;
+
+		k=4;
+		A += 4 + (sda-1)*bs;
+		x += 4;
+		for(; k<kmax-3; k+=4)
+			{
+			
+			x_0 = x[0];
+			x_1 = x[1];
+			x_2 = x[2];
+			x_3 = x[3];
+			
+			y_0 -= A[0+bs*0] * x_0;
+			y_1 -= A[0+bs*1] * x_0;
+
+			y_0 -= A[1+bs*0] * x_1;
+			y_1 -= A[1+bs*1] * x_1;
+			
+			y_0 -= A[2+bs*0] * x_2;
+			y_1 -= A[2+bs*1] * x_2;
+
+			y_0 -= A[3+bs*0] * x_3;
+			y_1 -= A[3+bs*1] * x_3;
+			
+			A += sda*bs;
+			x += 4;
+
+			}
+		}
+	else
+		{
+		A += 2;
+		x += 2;
+		}
+	for(; k<kmax; k++)
+		{
+		
+		x_0 = x[0];
+		
+		y_0 -= A[0+bs*0] * x_0;
+		y_1 -= A[0+bs*1] * x_0;
+		
+		A += 1;//sda*bs;
+		x += 1;
+
+		}
+
+	y_0 = y[0] + y_0;
+	y_1 = y[1] + y_1;
+
+	A = tA;
+	x = tx;
+
+	// top triangle
+	y_1 *= inv_diag_A[1];
+	z[1] = y_1;
+
+	y_0 -= A[1+bs*0] * y_1;
+	y_0 *= inv_diag_A[0];
+	z[0] = y_0;
+
+	}
+#endif
+	
+	
+	
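+// variant of kernel_dtrsv_lt_inv_4_lib4 solving only for the first unknown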
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsv_lt_inv_1_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
+	{
+
+	const int bs = 4;
+	
+	int
+		k;
+	
+	double *tA, *tx;
+	tA = A;
+	tx = x;
+
+	double
+		x_0, x_1, x_2, x_3,
+		y_0=0;
+	
+	k = 1;
+	if(kmax>4)
+		{
+		// clean up at the beginning
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+
+		y_0 -= A[1+bs*0] * x_1;
+		y_0 -= A[2+bs*0] * x_2;
+		y_0 -= A[3+bs*0] * x_3;
+
+		k=4;
+		A += 4 + (sda-1)*bs;
+		x += 4;
+		for(; k<kmax-3; k+=4)
+			{
+			
+			x_0 = x[0];
+			x_1 = x[1];
+			x_2 = x[2];
+			x_3 = x[3];
+			
+			y_0 -= A[0+bs*0] * x_0;
+			y_0 -= A[1+bs*0] * x_1;
+			y_0 -= A[2+bs*0] * x_2;
+			y_0 -= A[3+bs*0] * x_3;
+			
+			A += sda*bs;
+			x += 4;
+
+			}
+		}
+	else
+		{
+		A += 1;
+		x += 1;
+		}
+	for(; k<kmax; k++)
+		{
+		
+		x_0 = x[0];
+		
+		y_0 -= A[0+bs*0] * x_0;
+		
+		A += 1;//sda*bs;
+		x += 1;
+
+		}
+
+	y_0 = y[0] + y_0;
+
+	A = tA;
+	x = tx;
+
+	// top triangle
+	y_0 *= inv_diag_A[0];
+	z[0] = y_0;
+
+	}
+#endif
+	
+	
+	
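+// computes z[0:4] = A * x for a 4-row slice of an upper triangular matrix: only the upper
+// part of the leading 4x4 block is used, the remaining kmax-4 columns are dense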
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmv_un_4_lib4(int kmax, double *A, double *x, double *z)
+	{
+
+	const int bs = 4;
+	
+	int k;
+
+	double
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	x_0 = x[0];
+	x_1 = x[1];
+	x_2 = x[2];
+	x_3 = x[3];
+
+	y_0 += A[0+bs*0] * x_0;
+/*	y_1 += A[1+bs*0] * x_0;*/
+/*	y_2 += A[2+bs*0] * x_0;*/
+/*	y_3 += A[3+bs*0] * x_0;*/
+
+	y_0 += A[0+bs*1] * x_1;
+	y_1 += A[1+bs*1] * x_1;
+/*	y_2 += A[2+bs*1] * x_1;*/
+/*	y_3 += A[3+bs*1] * x_1;*/
+
+	y_0 += A[0+bs*2] * x_2;
+	y_1 += A[1+bs*2] * x_2;
+	y_2 += A[2+bs*2] * x_2;
+/*	y_3 += A[3+bs*2] * x_2;*/
+
+	y_0 += A[0+bs*3] * x_3;
+	y_1 += A[1+bs*3] * x_3;
+	y_2 += A[2+bs*3] * x_3;
+	y_3 += A[3+bs*3] * x_3;
+	
+	A += 4*bs;
+	x += 4;
+
+	k=4;
+	for(; k<kmax-3; k+=4)
+		{
+
+		x_0 = x[0];
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[1+bs*0] * x_0;
+		y_2 += A[2+bs*0] * x_0;
+		y_3 += A[3+bs*0] * x_0;
+
+		y_0 += A[0+bs*1] * x_1;
+		y_1 += A[1+bs*1] * x_1;
+		y_2 += A[2+bs*1] * x_1;
+		y_3 += A[3+bs*1] * x_1;
+
+		y_0 += A[0+bs*2] * x_2;
+		y_1 += A[1+bs*2] * x_2;
+		y_2 += A[2+bs*2] * x_2;
+		y_3 += A[3+bs*2] * x_2;
+
+		y_0 += A[0+bs*3] * x_3;
+		y_1 += A[1+bs*3] * x_3;
+		y_2 += A[2+bs*3] * x_3;
+		y_3 += A[3+bs*3] * x_3;
+		
+		A += 4*bs;
+		x += 4;
+
+		}
+
+	for(; k<kmax; k++)
+		{
+
+		x_0 = x[0];
+
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[1+bs*0] * x_0;
+		y_2 += A[2+bs*0] * x_0;
+		y_3 += A[3+bs*0] * x_0;
+		
+		A += 1*bs;
+		x += 1;
+
+		}
+
+	z[0] = y_0;
+	z[1] = y_1;
+	z[2] = y_2;
+	z[3] = y_3;
+
+	}
+#endif
+	
+	
+	
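+// computes z[0:4] = A' * x for a 4-column slice of an upper triangular matrix: the leading
+// kmax-4 rows are dense, the trailing 4x4 block contributes only its upper part;
+// km limits how many of the 4 results are stored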
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmv_ut_4_vs_lib4(int kmax, double *A, int sda, double *x, double *z, int km)
+	{
+
+	const int bs  = 4;
+	
+	int
+		k;
+	
+	double
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	k=0;
+	for(; k<kmax-4; k+=4)
+		{
+		
+		x_0 = x[0];
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+		
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[0+bs*1] * x_0;
+		y_2 += A[0+bs*2] * x_0;
+		y_3 += A[0+bs*3] * x_0;
+
+		y_0 += A[1+bs*0] * x_1;
+		y_1 += A[1+bs*1] * x_1;
+		y_2 += A[1+bs*2] * x_1;
+		y_3 += A[1+bs*3] * x_1;
+		
+		y_0 += A[2+bs*0] * x_2;
+		y_1 += A[2+bs*1] * x_2;
+		y_2 += A[2+bs*2] * x_2;
+		y_3 += A[2+bs*3] * x_2;
+
+		y_0 += A[3+bs*0] * x_3;
+		y_1 += A[3+bs*1] * x_3;
+		y_2 += A[3+bs*2] * x_3;
+		y_3 += A[3+bs*3] * x_3;
+		
+		A += sda*bs;
+		x += 4;
+
+		}
+
+	x_0 = x[0];
+	x_1 = x[1];
+	x_2 = x[2];
+	x_3 = x[3];
+	
+	y_0 += A[0+bs*0] * x_0;
+	y_1 += A[0+bs*1] * x_0;
+	y_2 += A[0+bs*2] * x_0;
+	y_3 += A[0+bs*3] * x_0;
+
+/*	y_0 += A[1+bs*0] * x_1;*/
+	y_1 += A[1+bs*1] * x_1;
+	y_2 += A[1+bs*2] * x_1;
+	y_3 += A[1+bs*3] * x_1;
+	
+/*	y_0 += A[2+bs*0] * x_2;*/
+/*	y_1 += A[2+bs*1] * x_2;*/
+	y_2 += A[2+bs*2] * x_2;
+	y_3 += A[2+bs*3] * x_2;
+
+/*	y_0 += A[3+bs*0] * x_3;*/
+/*	y_1 += A[3+bs*1] * x_3;*/
+/*	y_2 += A[3+bs*2] * x_3;*/
+	y_3 += A[3+bs*3] * x_3;
+	
+//	A += sda*bs;
+//	x += 4;
+
+	// store_vs
+	store:
+	if(km>=4)
+		{
+		z[0] = y_0;
+		z[1] = y_1;
+		z[2] = y_2;
+		z[3] = y_3;
+		}
+	else
+		{
+		z[0] = y_0;
+		if(km>=2)
+			{
+			z[1] = y_1;
+			if(km>2)
+				{
+				z[2] = y_2;
+				}
+			}
+		}
+
+	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmv_ut_4_lib4(int kmax, double *A, int sda, double *x, double *z)
+	{
+	
+	kernel_dtrmv_ut_4_vs_lib4(kmax, A, sda, x, z, 4);
+
+	}
+#endif
+
+
+
+
+
diff --git a/kernel/c99/kernel_dgeqrf_4_lib4.c b/kernel/c99/kernel_dgeqrf_4_lib4.c
new file mode 100644
index 0000000..071ec86
--- /dev/null
+++ b/kernel/c99/kernel_dgeqrf_4_lib4.c
@@ -0,0 +1,2620 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
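+// Householder QR factorization of an m x 4 panel: for each of the 4 columns, a reflector
+// is computed and stored below the diagonal of pD (scalar factors in dD), then applied
+// to the columns to its right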
+void kernel_dgeqrf_4_lib4(int m, double *pD, int sdd, double *dD)
+	{
+	int ii, jj, ll;
+	double alpha, beta, tmp, w1, w2, w3;
+	const int ps = 4;
+	// first column
+	beta = 0.0;
+	ii = 1;
+	if(m>1)
+		{
+		tmp = pD[1+ps*0];
+		beta += tmp*tmp;
+		if(m>2)
+			{
+			tmp = pD[2+ps*0];
+			beta += tmp*tmp;
+			if(m>3)
+				{
+				tmp = pD[3+ps*0];
+				beta += tmp*tmp;
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*0];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[0] = 0.0;
+		}
+	else
+		{
+		alpha = pD[0+ps*0];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau0
+		dD[0] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0
+		pD[0+ps*0] = beta;
+		ii = 1;
+		if(m>1)
+			{
+			pD[1+ps*0] *= tmp;
+			if(m>2)
+				{
+				pD[2+ps*0] *= tmp;
+				if(m>3)
+					{
+					pD[3+ps*0] *= tmp;
+					}
+				}
+			}
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*0] *= tmp;
+			pD[1+ii*sdd+ps*0] *= tmp;
+			pD[2+ii*sdd+ps*0] *= tmp;
+			pD[3+ii*sdd+ps*0] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*0] *= tmp;
+			}
+		}
+	// gemv_t & ger
+	w1 = pD[0+ps*1];
+	w2 = pD[0+ps*2];
+	w3 = pD[0+ps*3];
+	if(m>1)
+		{
+		w1 += pD[1+ps*1] * pD[1+ps*0];
+		w2 += pD[1+ps*2] * pD[1+ps*0];
+		w3 += pD[1+ps*3] * pD[1+ps*0];
+		if(m>2)
+			{
+			w1 += pD[2+ps*1] * pD[2+ps*0];
+			w2 += pD[2+ps*2] * pD[2+ps*0];
+			w3 += pD[2+ps*3] * pD[2+ps*0];
+			if(m>3)
+				{
+				w1 += pD[3+ps*1] * pD[3+ps*0];
+				w2 += pD[3+ps*2] * pD[3+ps*0];
+				w3 += pD[3+ps*3] * pD[3+ps*0];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		w1 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+		w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+		w1 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+		w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+		w1 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+		w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+		w1 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+		w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		w1 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+		w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+		}
+	w1 = - dD[0] * w1;
+	w2 = - dD[0] * w2;
+	w3 = - dD[0] * w3;
+	pD[0+ps*1] += w1;
+	pD[0+ps*2] += w2;
+	pD[0+ps*3] += w3;
+	if(m>1)
+		{
+		pD[1+ps*1] += w1 * pD[1+ps*0];
+		pD[1+ps*2] += w2 * pD[1+ps*0];
+		pD[1+ps*3] += w3 * pD[1+ps*0];
+		if(m>2)
+			{
+			pD[2+ps*1] += w1 * pD[2+ps*0];
+			pD[2+ps*2] += w2 * pD[2+ps*0];
+			pD[2+ps*3] += w3 * pD[2+ps*0];
+			if(m>3)
+				{
+				pD[3+ps*1] += w1 * pD[3+ps*0];
+				pD[3+ps*2] += w2 * pD[3+ps*0];
+				pD[3+ps*3] += w3 * pD[3+ps*0];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		pD[0+ii*sdd+ps*1] += w1 * pD[0+ii*sdd+ps*0];
+		pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*0];
+		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*0];
+		pD[1+ii*sdd+ps*1] += w1 * pD[1+ii*sdd+ps*0];
+		pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*0];
+		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*0];
+		pD[2+ii*sdd+ps*1] += w1 * pD[2+ii*sdd+ps*0];
+		pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*0];
+		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*0];
+		pD[3+ii*sdd+ps*1] += w1 * pD[3+ii*sdd+ps*0];
+		pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*0];
+		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*0];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		pD[ll+ii*sdd+ps*1] += w1 * pD[ll+ii*sdd+ps*0];
+		pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*0];
+		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*0];
+		}
+	if(m==1)
+		return;
+	// second column
+	beta = 0.0;
+	if(m>2)
+		{
+		tmp = pD[2+ps*1];
+		beta += tmp*tmp;
+		if(m>3)
+			{
+			tmp = pD[3+ps*1];
+			beta += tmp*tmp;
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*1];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[1] = 0.0;
+		}
+	else
+		{
+		alpha = pD[1+ps*1];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau0
+		dD[1] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0
+		pD[1+ps*1] = beta;
+		if(m>2)
+			{
+			pD[2+ps*1] *= tmp;
+			if(m>3)
+				{
+				pD[3+ps*1] *= tmp;
+				}
+			}
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*1] *= tmp;
+			pD[1+ii*sdd+ps*1] *= tmp;
+			pD[2+ii*sdd+ps*1] *= tmp;
+			pD[3+ii*sdd+ps*1] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*1] *= tmp;
+			}
+		}
+	// gemv_t & ger
+	w2 = pD[1+ps*2];
+	w3 = pD[1+ps*3];
+	if(m>2)
+		{
+		w2 += pD[2+ps*2] * pD[2+ps*1];
+		w3 += pD[2+ps*3] * pD[2+ps*1];
+		if(m>3)
+			{
+			w2 += pD[3+ps*2] * pD[3+ps*1];
+			w3 += pD[3+ps*3] * pD[3+ps*1];
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+		w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+		w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+		w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+		}
+	w2 = - dD[1] * w2;
+	w3 = - dD[1] * w3;
+	pD[1+ps*2] += w2;
+	pD[1+ps*3] += w3;
+	if(m>2)
+		{
+		pD[2+ps*2] += w2 * pD[2+ps*1];
+		pD[2+ps*3] += w3 * pD[2+ps*1];
+		if(m>3)
+			{
+			pD[3+ps*2] += w2 * pD[3+ps*1];
+			pD[3+ps*3] += w3 * pD[3+ps*1];
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*1];
+		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*1];
+		pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*1];
+		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*1];
+		pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*1];
+		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*1];
+		pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*1];
+		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*1];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*1];
+		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*1];
+		}
+	if(m==2)
+		return;
+	// third column
+	beta = 0.0;
+	if(m>3)
+		{
+		tmp = pD[3+ps*2];
+		beta += tmp*tmp;
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*2];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[2] = 0.0;
+		}
+	else
+		{
+		alpha = pD[2+ps*2];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau0
+		dD[2] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0
+		pD[2+ps*2] = beta;
+		if(m>3)
+			{
+			pD[3+ps*2] *= tmp;
+			}
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*2] *= tmp;
+			pD[1+ii*sdd+ps*2] *= tmp;
+			pD[2+ii*sdd+ps*2] *= tmp;
+			pD[3+ii*sdd+ps*2] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*2] *= tmp;
+			}
+		}
+	// gemv_t & ger
+	w3 = pD[2+ps*3];
+	if(m>3)
+		{
+		w3 += pD[3+ps*3] * pD[3+ps*2];
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+		w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+		w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+		w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+		}
+	w3 = - dD[2] * w3;
+	pD[2+ps*3] += w3;
+	if(m>3)
+		{
+		pD[3+ps*3] += w3 * pD[3+ps*2];
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*2];
+		pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*2];
+		pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*2];
+		pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*2];
+		}
+	if(m==3)
+		return;
+	// fourth column
+	beta = 0.0;
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		tmp = pD[0+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		tmp = pD[1+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		tmp = pD[2+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		tmp = pD[3+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		tmp = pD[ll+ii*sdd+ps*3];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[3] = 0.0;
+		}
+	else
+		{
+		alpha = pD[3+ps*3];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau3
+		dD[3] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v3
+		pD[3+ps*3] = beta;
+		for(ii=4; ii<m-3; ii+=4)
+			{
+			pD[0+ii*sdd+ps*3] *= tmp;
+			pD[1+ii*sdd+ps*3] *= tmp;
+			pD[2+ii*sdd+ps*3] *= tmp;
+			pD[3+ii*sdd+ps*3] *= tmp;
+			}
+		for(ll=0; ll<m-ii; ll++)
+			{
+			pD[ll+ii*sdd+ps*3] *= tmp;
+			}
+		}
+	return;
+	}
+
+
+// unblocked algorithm
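+// variable-size QR factorization: computes k Householder reflectors of the m x n panel-major matrix pD (row offset offD, panel stride sdd) and stores the scalar factors tau in dD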
+void kernel_dgeqrf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+	const int ps = 4;
+	imax = k; // number of columns to factorize
+	double alpha, beta, tmp, w0;
+	double *pC00, *pC10, *pC01, *pC11;
+	int offset;
+	double *pD0 = pD-offD;
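+	// in the loop below, pC00 and pC10 point to elements (ii,ii) and (ii+1,ii) of the panel-major matrix: the row index offD+ii is split into the offset within its panel ((offD+ii)&(ps-1)) and the first row of that panel, which is scaled by the panel stride sdd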
+	for(ii=0; ii<imax; ii++)
+		{
+		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+		pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+		beta = 0.0;
+		jmax = m-ii-1;
+		jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+		jmax0 = jmax<jmax0 ? jmax : jmax0;
+		offset = 0;
+		jj = 0;
+		if(jmax0>0)
+			{
+			for( ; jj<jmax0; jj++)
+				{
+				tmp = pC10[0+offset];
+				beta += tmp*tmp;
+				offset += 1;
+				}
+			offset += -ps+ps*sdd;
+			}
+		for( ; jj<jmax-3; jj+=4)
+			{
+			tmp = pC10[0+offset];
+			beta += tmp*tmp;
+			tmp = pC10[1+offset];
+			beta += tmp*tmp;
+			tmp = pC10[2+offset];
+			beta += tmp*tmp;
+			tmp = pC10[3+offset];
+			beta += tmp*tmp;
+			offset += ps*sdd;
+			}
+		for(ll=0; ll<jmax-jj; ll++)
+			{
+			tmp = pC10[0+offset];
+			beta += tmp*tmp;
+			offset += 1;
+			}
+		if(beta==0.0)
+			{
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			offset = 0;
+			jj = 0;
+			if(jmax0>0)
+				{
+				for( ; jj<jmax0; jj++)
+					{
+					pC10[0+offset] *= tmp;
+					offset += 1;
+					}
+				offset += -ps+ps*sdd;
+				}
+			for( ; jj<jmax-3; jj+=4)
+				{
+				pC10[0+offset] *= tmp;
+				pC10[1+offset] *= tmp;
+				pC10[2+offset] *= tmp;
+				pC10[3+offset] *= tmp;
+				offset += ps*sdd;
+				}
+			for(ll=0; ll<jmax-jj; ll++)
+				{
+				pC10[0+offset] *= tmp;
+				offset += 1;
+				}
+			pC00[0] = beta;
+			}
+		if(ii<n)
+			{
+			pC01 = pC00 + ps;
+			pC11 = pC10 + ps;
+			kmax = jmax;
+			kmax0 = jmax0;
+			jmax = n-ii-1;
+			jj = 0;
+			for( ; jj<jmax; jj++)
+				{
+				w0 = pC01[0+ps*jj] * 1.0;
+				offset = 0;
+				kk = 0;
+				if(kmax0>0)
+					{
+					for( ; kk<kmax0; kk++)
+						{
+						w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+						offset += 1;
+						}
+					offset += -ps+ps*sdd;
+					}
+				for( ; kk<kmax-3; kk+=4)
+					{
+					w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+					w0 += pC11[1+offset+ps*jj] * pC10[1+offset];
+					w0 += pC11[2+offset+ps*jj] * pC10[2+offset];
+					w0 += pC11[3+offset+ps*jj] * pC10[3+offset];
+					offset += ps*sdd;
+					}
+				for(ll=0; ll<kmax-kk; ll++)
+					{
+					w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+					offset += 1;
+					}
+				w0 = - dD[ii] * w0;
+				pC01[0+ps*jj] += w0;
+				offset = 0;
+				kk = 0;
+				if(kmax0>0)
+					{
+					for( ; kk<kmax0; kk++)
+						{
+						pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+						offset += 1;
+						}
+					offset = offset-ps+ps*sdd;
+					}
+				for( ; kk<kmax-3; kk+=4)
+					{
+					pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+					pC11[1+offset+ps*jj] += w0 * pC10[1+offset];
+					pC11[2+offset+ps*jj] += w0 * pC10[2+offset];
+					pC11[3+offset+ps*jj] += w0 * pC10[3+offset];
+					offset += ps*sdd;
+					}
+				for(ll=0; ll<kmax-kk; ll++)
+					{
+					pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+					offset += 1;
+					}
+				}
+			}
+		}
+	return;
+	}
+
+
+
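+// applies the block of 4 Householder reflectors (vectors in pD, scalar factors tau in dD) to the m x n matrix pC0 from the left; this is the trailing-matrix update of the QR factorization (cf. LAPACK dlarfb)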
+void kernel_dlarf_4_lib4(int m, int n, double *pD, int sdd, double *dD, double *pC0, int sdc)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, ll;
+	const int ps = 4;
+	double v10,
+	       v20, v21,
+		   v30, v31, v32;
+	double tmp, d0, d1, d2, d3;
+	double *pC;
+	double pT[16]; // 4x4 lower triangular factor T (tau on the diagonal)
+	int ldt = 4;
+	double pW[8]; // workspace for W^T (two columns of C at a time)
+	int ldw = 2;
+	// dot products among the reflector vectors (vij = vi^T * vj)
+	v10 = 0.0;
+	v20 = 0.0;
+	v30 = 0.0;
+	v21 = 0.0;
+	v31 = 0.0;
+	v32 = 0.0;
+	if(m>1)
+		{
+		v10 = 1.0 * pD[1+ps*0];
+		if(m>2)
+			{
+			v10 += pD[2+ps*1] * pD[2+ps*0];
+			v20 = 1.0 * pD[2+ps*0];
+			v21 = 1.0 * pD[2+ps*1];
+			if(m>3)
+				{
+				v10 += pD[3+ps*1] * pD[3+ps*0];
+				v20 += pD[3+ps*2] * pD[3+ps*0];
+				v21 += pD[3+ps*2] * pD[3+ps*1];
+				v30 = 1.0 * pD[3+ps*0];
+				v31 = 1.0 * pD[3+ps*1];
+				v32 = 1.0 * pD[3+ps*2];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+		v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+		v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+		v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+		v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+		v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+		v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+		v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+		v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+		v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+		v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+		v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+		v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+		v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+		v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+		v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+		v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+		v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+		v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+		v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+		v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+		v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+		v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+		v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+		v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+		v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+		v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+		v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+		v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+		}
+	// compute lower triangular T containing tau for matrix update
+	pT[0+ldt*0] = dD[0];
+	pT[1+ldt*1] = dD[1];
+	pT[2+ldt*2] = dD[2];
+	pT[3+ldt*3] = dD[3];
+	pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+	pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+	pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+	pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+	pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+	pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+	// downgrade matrix
+	pW[0] = 0.0;
+	pW[1] = 0.0;
+	pW[2] = 0.0;
+	pW[3] = 0.0;
+	pW[4] = 0.0;
+	pW[5] = 0.0;
+	pW[6] = 0.0;
+	pW[7] = 0.0;
+	ii = 0;
+	for( ; ii<n-1; ii+=2)
+		{
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ldw*0] = tmp;
+		tmp = pC[0+ps*1];
+		pW[1+ldw*0] = tmp;
+		if(m>1)
+			{
+			d0 = pD[1+ps*0];
+			tmp = pC[1+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] = tmp;
+			tmp = pC[1+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] = tmp;
+			if(m>2)
+				{
+				d0 = pD[2+ps*0];
+				d1 = pD[2+ps*1];
+				tmp = pC[2+ps*0];
+				pW[0+ldw*0] += tmp * d0;
+				pW[0+ldw*1] += tmp * d1;
+				pW[0+ldw*2] = tmp;
+				tmp = pC[2+ps*1];
+				pW[1+ldw*0] += tmp * d0;
+				pW[1+ldw*1] += tmp * d1;
+				pW[1+ldw*2] = tmp;
+				if(m>3)
+					{
+					d0 = pD[3+ps*0];
+					d1 = pD[3+ps*1];
+					d2 = pD[3+ps*2];
+					tmp = pC[3+ps*0];
+					pW[0+ldw*0] += tmp * d0;
+					pW[0+ldw*1] += tmp * d1;
+					pW[0+ldw*2] += tmp * d2;
+					pW[0+ldw*3] = tmp;
+					tmp = pC[3+ps*1];
+					pW[1+ldw*0] += tmp * d0;
+					pW[1+ldw*1] += tmp * d1;
+					pW[1+ldw*2] += tmp * d2;
+					pW[1+ldw*3] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pD[0+jj*sdd+ps*0];
+			d1 = pD[0+jj*sdd+ps*1];
+			d2 = pD[0+jj*sdd+ps*2];
+			d3 = pD[0+jj*sdd+ps*3];
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[0+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[1+jj*sdd+ps*0];
+			d1 = pD[1+jj*sdd+ps*1];
+			d2 = pD[1+jj*sdd+ps*2];
+			d3 = pD[1+jj*sdd+ps*3];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[1+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[2+jj*sdd+ps*0];
+			d1 = pD[2+jj*sdd+ps*1];
+			d2 = pD[2+jj*sdd+ps*2];
+			d3 = pD[2+jj*sdd+ps*3];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[2+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			//
+			d0 = pD[3+jj*sdd+ps*0];
+			d1 = pD[3+jj*sdd+ps*1];
+			d2 = pD[3+jj*sdd+ps*2];
+			d3 = pD[3+jj*sdd+ps*3];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[3+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pD[ll+jj*sdd+ps*0];
+			d1 = pD[ll+jj*sdd+ps*1];
+			d2 = pD[ll+jj*sdd+ps*2];
+			d3 = pD[ll+jj*sdd+ps*3];
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * d0;
+			pW[0+ldw*1] += tmp * d1;
+			pW[0+ldw*2] += tmp * d2;
+			pW[0+ldw*3] += tmp * d3;
+			tmp = pC[ll+jj*sdc+ps*1];
+			pW[1+ldw*0] += tmp * d0;
+			pW[1+ldw*1] += tmp * d1;
+			pW[1+ldw*2] += tmp * d2;
+			pW[1+ldw*3] += tmp * d3;
+			}
+		// compute W^T *= T
+		pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+		pW[1+ldw*3] = pT[3+ldt*0]*pW[1+ldw*0] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[1+ldw*2] + pT[3+ldt*3]*pW[1+ldw*3];
+		pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+		pW[1+ldw*2] = pT[2+ldt*0]*pW[1+ldw*0] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[1+ldw*2];
+		pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+		pW[1+ldw*1] = pT[1+ldt*0]*pW[1+ldw*0] + pT[1+ldt*1]*pW[1+ldw*1];
+		pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+		pW[1+ldw*0] = pT[0+ldt*0]*pW[1+ldw*0];
+		// compute C -= V * W^T
+		pC[0+ps*0] -= pW[0+ldw*0];
+		pC[0+ps*1] -= pW[1+ldw*0];
+		if(m>1)
+			{
+			pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+			pC[1+ps*1] -= pD[1+ps*0]*pW[1+ldw*0] + pW[1+ldw*1];
+			if(m>2)
+				{
+				pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+				pC[2+ps*1] -= pD[2+ps*0]*pW[1+ldw*0] + pD[2+ps*1]*pW[1+ldw*1] + pW[1+ldw*2];
+				if(m>3)
+					{
+					pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+					pC[3+ps*1] -= pD[3+ps*0]*pW[1+ldw*0] + pD[3+ps*1]*pW[1+ldw*1] + pD[3+ps*2]*pW[1+ldw*2] + pW[1+ldw*3];
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pD[0+jj*sdd+ps*0];
+			d1 = pD[0+jj*sdd+ps*1];
+			d2 = pD[0+jj*sdd+ps*2];
+			d3 = pD[0+jj*sdd+ps*3];
+			pC[0+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[0+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[1+jj*sdd+ps*0];
+			d1 = pD[1+jj*sdd+ps*1];
+			d2 = pD[1+jj*sdd+ps*2];
+			d3 = pD[1+jj*sdd+ps*3];
+			pC[1+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[1+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[2+jj*sdd+ps*0];
+			d1 = pD[2+jj*sdd+ps*1];
+			d2 = pD[2+jj*sdd+ps*2];
+			d3 = pD[2+jj*sdd+ps*3];
+			pC[2+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[2+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			//
+			d0 = pD[3+jj*sdd+ps*0];
+			d1 = pD[3+jj*sdd+ps*1];
+			d2 = pD[3+jj*sdd+ps*2];
+			d3 = pD[3+jj*sdd+ps*3];
+			pC[3+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[3+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pD[ll+jj*sdd+ps*0];
+			d1 = pD[ll+jj*sdd+ps*1];
+			d2 = pD[ll+jj*sdd+ps*2];
+			d3 = pD[ll+jj*sdd+ps*3];
+			pC[ll+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+			pC[ll+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+			}
+		}
+	for( ; ii<n; ii++)
+		{
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ldw*0] = tmp;
+		if(m>1)
+			{
+			tmp = pC[1+ps*0];
+			pW[0+ldw*0] += tmp * pD[1+ps*0];
+			pW[0+ldw*1] = tmp;
+			if(m>2)
+				{
+				tmp = pC[2+ps*0];
+				pW[0+ldw*0] += tmp * pD[2+ps*0];
+				pW[0+ldw*1] += tmp * pD[2+ps*1];
+				pW[0+ldw*2] = tmp;
+				if(m>3)
+					{
+					tmp = pC[3+ps*0];
+					pW[0+ldw*0] += tmp * pD[3+ps*0];
+					pW[0+ldw*1] += tmp * pD[3+ps*1];
+					pW[0+ldw*2] += tmp * pD[3+ps*2];
+					pW[0+ldw*3] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[0+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[0+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[0+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[0+jj*sdd+ps*3];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[1+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[1+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[1+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[1+jj*sdd+ps*3];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[2+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[2+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[2+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[2+jj*sdd+ps*3];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[3+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[3+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[3+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[3+jj*sdd+ps*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ldw*0] += tmp * pD[ll+jj*sdd+ps*0];
+			pW[0+ldw*1] += tmp * pD[ll+jj*sdd+ps*1];
+			pW[0+ldw*2] += tmp * pD[ll+jj*sdd+ps*2];
+			pW[0+ldw*3] += tmp * pD[ll+jj*sdd+ps*3];
+			}
+		// compute W^T *= T
+		pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+		pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+		pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+		pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+		// compute C -= V * W^T
+		pC[0+ps*0] -= pW[0+ldw*0];
+		if(m>1)
+			{
+			pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+			if(m>2)
+				{
+				pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+				if(m>3)
+					{
+					pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			pC[0+jj*sdc+ps*0] -= pD[0+jj*sdd+ps*0]*pW[0+ldw*0] + pD[0+jj*sdd+ps*1]*pW[0+ldw*1] + pD[0+jj*sdd+ps*2]*pW[0+ldw*2] + pD[0+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[1+jj*sdc+ps*0] -= pD[1+jj*sdd+ps*0]*pW[0+ldw*0] + pD[1+jj*sdd+ps*1]*pW[0+ldw*1] + pD[1+jj*sdd+ps*2]*pW[0+ldw*2] + pD[1+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[2+jj*sdc+ps*0] -= pD[2+jj*sdd+ps*0]*pW[0+ldw*0] + pD[2+jj*sdd+ps*1]*pW[0+ldw*1] + pD[2+jj*sdd+ps*2]*pW[0+ldw*2] + pD[2+jj*sdd+ps*3]*pW[0+ldw*3];
+			pC[3+jj*sdc+ps*0] -= pD[3+jj*sdd+ps*0]*pW[0+ldw*0] + pD[3+jj*sdd+ps*1]*pW[0+ldw*1] + pD[3+jj*sdd+ps*2]*pW[0+ldw*2] + pD[3+jj*sdd+ps*3]*pW[0+ldw*3];
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			pC[ll+jj*sdc+ps*0] -= pD[ll+jj*sdd+ps*0]*pW[0+ldw*0] + pD[ll+jj*sdd+ps*1]*pW[0+ldw*1] + pD[ll+jj*sdd+ps*2]*pW[0+ldw*2] + pD[ll+jj*sdd+ps*3]*pW[0+ldw*3];
+			}
+		}
+
+	return;
+	}
+
+
+
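+// same operation as kernel_dlarf_4_lib4, but the reflector vectors are read from pVt, a transposed copy of the vectors in pD (one reflector per row)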
+void kernel_dlarf_t_4_lib4(int m, int n, double *pD, int sdd, double *pVt, double *dD, double *pC0, int sdc)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, ll;
+	const int ps = 4;
+	double v10,
+	       v20, v21,
+		   v30, v31, v32;
+	double c00, c01,
+	       c10, c11,
+	       c20, c21,
+	       c30, c31;
+	double a0, a1, a2, a3, b0, b1;
+	double tmp, d0, d1, d2, d3;
+	double *pC;
+	double pT[16]; // 4x4 lower triangular factor T (tau on the diagonal)
+	int ldt = 4;
+	double pW[8]; // workspace for W^T (two columns of C at a time)
+	int ldw = 4;
+	// dot products among the reflector vectors (vij = vi^T * vj)
+	v10 = 0.0;
+	v20 = 0.0;
+	v30 = 0.0;
+	v21 = 0.0;
+	v31 = 0.0;
+	v32 = 0.0;
+	if(m>1)
+		{
+		v10 = 1.0 * pD[1+ps*0];
+		if(m>2)
+			{
+			v10 += pD[2+ps*1] * pD[2+ps*0];
+			v20 = 1.0 * pD[2+ps*0];
+			v21 = 1.0 * pD[2+ps*1];
+			if(m>3)
+				{
+				v10 += pD[3+ps*1] * pD[3+ps*0];
+				v20 += pD[3+ps*2] * pD[3+ps*0];
+				v21 += pD[3+ps*2] * pD[3+ps*1];
+				v30 = 1.0 * pD[3+ps*0];
+				v31 = 1.0 * pD[3+ps*1];
+				v32 = 1.0 * pD[3+ps*2];
+				}
+			}
+		}
+	for(ii=4; ii<m-3; ii+=4)
+		{
+		v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+		v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+		v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+		v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+		v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+		v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+		v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+		v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+		v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+		v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+		v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+		v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+		v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+		v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+		v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+		v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+		v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+		v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+		v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+		v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+		v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+		v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+		v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+		v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+		}
+	for(ll=0; ll<m-ii; ll++)
+		{
+		v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+		v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+		v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+		v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+		v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+		v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+		}
+	// compute lower triangular T containing tau for matrix update
+	pT[0+ldt*0] = dD[0];
+	pT[1+ldt*1] = dD[1];
+	pT[2+ldt*2] = dD[2];
+	pT[3+ldt*3] = dD[3];
+	pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+	pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+	pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+	pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+	pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+	pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+	// downgrade matrix
+	pW[0] = 0.0;
+	pW[1] = 0.0;
+	pW[2] = 0.0;
+	pW[3] = 0.0;
+	pW[4] = 0.0;
+	pW[5] = 0.0;
+	pW[6] = 0.0;
+	pW[7] = 0.0;
+	ii = 0;
+	for( ; ii<n-1; ii+=2)
+		{
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ldw*0] = tmp;
+		tmp = pC[0+ps*1];
+		pW[0+ldw*1] = tmp;
+		if(m>1)
+			{
+			d0 = pVt[0+ps*1];
+			tmp = pC[1+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] = tmp;
+			tmp = pC[1+ps*1];
+			pW[0+ldw*1] += d0 * tmp;
+			pW[1+ldw*1] = tmp;
+			if(m>2)
+				{
+				d0 = pVt[0+ps*2];
+				d1 = pVt[1+ps*2];
+				tmp = pC[2+ps*0];
+				pW[0+ldw*0] += d0 * tmp;
+				pW[1+ldw*0] += d1 * tmp;
+				pW[2+ldw*0] = tmp;
+				tmp = pC[2+ps*1];
+				pW[0+ldw*1] += d0 * tmp;
+				pW[1+ldw*1] += d1 * tmp;
+				pW[2+ldw*1] = tmp;
+				if(m>3)
+					{
+					d0 = pVt[0+ps*3];
+					d1 = pVt[1+ps*3];
+					d2 = pVt[2+ps*3];
+					tmp = pC[3+ps*0];
+					pW[0+ldw*0] += d0 * tmp;
+					pW[1+ldw*0] += d1 * tmp;
+					pW[2+ldw*0] += d2 * tmp;
+					pW[3+ldw*0] = tmp;
+					tmp = pC[3+ps*1];
+					pW[0+ldw*1] += d0 * tmp;
+					pW[1+ldw*1] += d1 * tmp;
+					pW[2+ldw*1] += d2 * tmp;
+					pW[3+ldw*1] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pVt[0+ps*(0+jj)];
+			d1 = pVt[1+ps*(0+jj)];
+			d2 = pVt[2+ps*(0+jj)];
+			d3 = pVt[3+ps*(0+jj)];
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			tmp = pC[0+jj*sdc+ps*1];
+			pW[0+ldw*1] += d0 * tmp;
+			pW[1+ldw*1] += d1 * tmp;
+			pW[2+ldw*1] += d2 * tmp;
+			pW[3+ldw*1] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(1+jj)];
+			d1 = pVt[1+ps*(1+jj)];
+			d2 = pVt[2+ps*(1+jj)];
+			d3 = pVt[3+ps*(1+jj)];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			tmp = pC[1+jj*sdc+ps*1];
+			pW[0+ldw*1] += d0 * tmp;
+			pW[1+ldw*1] += d1 * tmp;
+			pW[2+ldw*1] += d2 * tmp;
+			pW[3+ldw*1] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(2+jj)];
+			d1 = pVt[1+ps*(2+jj)];
+			d2 = pVt[2+ps*(2+jj)];
+			d3 = pVt[3+ps*(2+jj)];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			tmp = pC[2+jj*sdc+ps*1];
+			pW[0+ldw*1] += d0 * tmp;
+			pW[1+ldw*1] += d1 * tmp;
+			pW[2+ldw*1] += d2 * tmp;
+			pW[3+ldw*1] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(3+jj)];
+			d1 = pVt[1+ps*(3+jj)];
+			d2 = pVt[2+ps*(3+jj)];
+			d3 = pVt[3+ps*(3+jj)];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			tmp = pC[3+jj*sdc+ps*1];
+			pW[0+ldw*1] += d0 * tmp;
+			pW[1+ldw*1] += d1 * tmp;
+			pW[2+ldw*1] += d2 * tmp;
+			pW[3+ldw*1] += d3 * tmp;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pVt[0+ps*(ll+jj)];
+			d1 = pVt[1+ps*(ll+jj)];
+			d2 = pVt[2+ps*(ll+jj)];
+			d3 = pVt[3+ps*(ll+jj)];
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			tmp = pC[ll+jj*sdc+ps*1];
+			pW[0+ldw*1] += d0 * tmp;
+			pW[1+ldw*1] += d1 * tmp;
+			pW[2+ldw*1] += d2 * tmp;
+			pW[3+ldw*1] += d3 * tmp;
+			}
+		// compute W^T *= T
+		pW[3+ldw*0] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[1+ldw*0] + pT[3+ldt*2]*pW[2+ldw*0] + pT[3+ldt*3]*pW[3+ldw*0];
+		pW[3+ldw*1] = pT[3+ldt*0]*pW[0+ldw*1] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[2+ldw*1] + pT[3+ldt*3]*pW[3+ldw*1];
+		pW[2+ldw*0] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[1+ldw*0] + pT[2+ldt*2]*pW[2+ldw*0];
+		pW[2+ldw*1] = pT[2+ldt*0]*pW[0+ldw*1] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[2+ldw*1];
+		pW[1+ldw*0] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[1+ldw*0];
+		pW[1+ldw*1] = pT[1+ldt*0]*pW[0+ldw*1] + pT[1+ldt*1]*pW[1+ldw*1];
+		pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+		pW[0+ldw*1] = pT[0+ldt*0]*pW[0+ldw*1];
+		// compute C -= V * W^T
+		jj = 0;
+		// load
+		c00 = pC[0+jj*sdc+ps*0];
+		c10 = pC[1+jj*sdc+ps*0];
+		c20 = pC[2+jj*sdc+ps*0];
+		c30 = pC[3+jj*sdc+ps*0];
+		c01 = pC[0+jj*sdc+ps*1];
+		c11 = pC[1+jj*sdc+ps*1];
+		c21 = pC[2+jj*sdc+ps*1];
+		c31 = pC[3+jj*sdc+ps*1];
+		// rank1
+		a1 = pD[1+jj*sdd+ps*0];
+		a2 = pD[2+jj*sdd+ps*0];
+		a3 = pD[3+jj*sdd+ps*0];
+		b0 = pW[0+ldw*0];
+		c00 -= b0;
+		c10 -= a1*b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		b1 = pW[0+ldw*1];
+		c01 -= b1;
+		c11 -= a1*b1;
+		c21 -= a2*b1;
+		c31 -= a3*b1;
+		// rank2
+		a2 = pD[2+jj*sdd+ps*1];
+		a3 = pD[3+jj*sdd+ps*1];
+		b0 = pW[1+ldw*0];
+		c10 -= b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		b1 = pW[1+ldw*1];
+		c11 -= b1;
+		c21 -= a2*b1;
+		c31 -= a3*b1;
+		// rank3
+		a3 = pD[3+jj*sdd+ps*2];
+		b0 = pW[2+ldw*0];
+		c20 -= b0;
+		c30 -= a3*b0;
+		b1 = pW[2+ldw*1];
+		c21 -= b1;
+		c31 -= a3*b1;
+		// rank4
+		a3 = pD[3+jj*sdd+ps*3];
+		b0 = pW[3+ldw*0];
+		c30 -= b0;
+		b1 = pW[3+ldw*1];
+		c31 -= b1;
+		// store
+		pC[0+jj*sdc+ps*0] = c00;
+		pC[0+jj*sdc+ps*1] = c01;
+		if(m>1)
+			{
+			pC[1+jj*sdc+ps*0] = c10;
+			pC[1+jj*sdc+ps*1] = c11;
+			if(m>2)
+				{
+				pC[2+jj*sdc+ps*0] = c20;
+				pC[2+jj*sdc+ps*1] = c21;
+				if(m>3)
+					{
+					pC[3+jj*sdc+ps*0] = c30;
+					pC[3+jj*sdc+ps*1] = c31;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			// load
+			c00 = pC[0+jj*sdc+ps*0];
+			c10 = pC[1+jj*sdc+ps*0];
+			c20 = pC[2+jj*sdc+ps*0];
+			c30 = pC[3+jj*sdc+ps*0];
+			c01 = pC[0+jj*sdc+ps*1];
+			c11 = pC[1+jj*sdc+ps*1];
+			c21 = pC[2+jj*sdc+ps*1];
+			c31 = pC[3+jj*sdc+ps*1];
+			//
+			a0 = pD[0+jj*sdd+ps*0];
+			a1 = pD[1+jj*sdd+ps*0];
+			a2 = pD[2+jj*sdd+ps*0];
+			a3 = pD[3+jj*sdd+ps*0];
+			b0 = pW[0+ldw*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			b1 = pW[0+ldw*1];
+			c01 -= a0*b1;
+			c11 -= a1*b1;
+			c21 -= a2*b1;
+			c31 -= a3*b1;
+			//
+			a0 = pD[0+jj*sdd+ps*1];
+			a1 = pD[1+jj*sdd+ps*1];
+			a2 = pD[2+jj*sdd+ps*1];
+			a3 = pD[3+jj*sdd+ps*1];
+			b0 = pW[1+ldw*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			b1 = pW[1+ldw*1];
+			c01 -= a0*b1;
+			c11 -= a1*b1;
+			c21 -= a2*b1;
+			c31 -= a3*b1;
+			//
+			a0 = pD[0+jj*sdd+ps*2];
+			a1 = pD[1+jj*sdd+ps*2];
+			a2 = pD[2+jj*sdd+ps*2];
+			a3 = pD[3+jj*sdd+ps*2];
+			b0 = pW[2+ldw*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			b1 = pW[2+ldw*1];
+			c01 -= a0*b1;
+			c11 -= a1*b1;
+			c21 -= a2*b1;
+			c31 -= a3*b1;
+			//
+			a0 = pD[0+jj*sdd+ps*3];
+			a1 = pD[1+jj*sdd+ps*3];
+			a2 = pD[2+jj*sdd+ps*3];
+			a3 = pD[3+jj*sdd+ps*3];
+			b0 = pW[3+ldw*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			b1 = pW[3+ldw*1];
+			c01 -= a0*b1;
+			c11 -= a1*b1;
+			c21 -= a2*b1;
+			c31 -= a3*b1;
+			// store
+			pC[0+jj*sdc+ps*0] = c00;
+			pC[1+jj*sdc+ps*0] = c10;
+			pC[2+jj*sdc+ps*0] = c20;
+			pC[3+jj*sdc+ps*0] = c30;
+			pC[0+jj*sdc+ps*1] = c01;
+			pC[1+jj*sdc+ps*1] = c11;
+			pC[2+jj*sdc+ps*1] = c21;
+			pC[3+jj*sdc+ps*1] = c31;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			// load
+			c00 = pC[ll+jj*sdc+ps*0];
+			c01 = pC[ll+jj*sdc+ps*1];
+			//
+			a0 = pD[ll+jj*sdd+ps*0];
+			b0 = pW[0+ldw*0];
+			c00 -= a0*b0;
+			b1 = pW[0+ldw*1];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*1];
+			b0 = pW[1+ldw*0];
+			c00 -= a0*b0;
+			b1 = pW[1+ldw*1];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*2];
+			b0 = pW[2+ldw*0];
+			c00 -= a0*b0;
+			b1 = pW[2+ldw*1];
+			c01 -= a0*b1;
+			//
+			a0 = pD[ll+jj*sdd+ps*3];
+			b0 = pW[3+ldw*0];
+			c00 -= a0*b0;
+			b1 = pW[3+ldw*1];
+			c01 -= a0*b1;
+			// store
+			pC[ll+jj*sdc+ps*0] = c00;
+			pC[ll+jj*sdc+ps*1] = c01;
+			}
+		}
+	for( ; ii<n; ii++)
+		{
+		pC = pC0+ii*ps;
+		// compute W^T = C^T * V
+		tmp = pC[0+ps*0];
+		pW[0+ldw*0] = tmp;
+		if(m>1)
+			{
+			d0 = pVt[0+ps*1];
+			tmp = pC[1+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] = tmp;
+			if(m>2)
+				{
+				d0 = pVt[0+ps*2];
+				d1 = pVt[1+ps*2];
+				tmp = pC[2+ps*0];
+				pW[0+ldw*0] += d0 * tmp;
+				pW[1+ldw*0] += d1 * tmp;
+				pW[2+ldw*0] = tmp;
+				if(m>3)
+					{
+					d0 = pVt[0+ps*3];
+					d1 = pVt[1+ps*3];
+					d2 = pVt[2+ps*3];
+					tmp = pC[3+ps*0];
+					pW[0+ldw*0] += d0 * tmp;
+					pW[1+ldw*0] += d1 * tmp;
+					pW[2+ldw*0] += d2 * tmp;
+					pW[3+ldw*0] = tmp;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			//
+			d0 = pVt[0+ps*(0+jj)];
+			d1 = pVt[1+ps*(0+jj)];
+			d2 = pVt[2+ps*(0+jj)];
+			d3 = pVt[3+ps*(0+jj)];
+			tmp = pC[0+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(1+jj)];
+			d1 = pVt[1+ps*(1+jj)];
+			d2 = pVt[2+ps*(1+jj)];
+			d3 = pVt[3+ps*(1+jj)];
+			tmp = pC[1+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(2+jj)];
+			d1 = pVt[1+ps*(2+jj)];
+			d2 = pVt[2+ps*(2+jj)];
+			d3 = pVt[3+ps*(2+jj)];
+			tmp = pC[2+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			//
+			d0 = pVt[0+ps*(3+jj)];
+			d1 = pVt[1+ps*(3+jj)];
+			d2 = pVt[2+ps*(3+jj)];
+			d3 = pVt[3+ps*(3+jj)];
+			tmp = pC[3+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			d0 = pVt[0+ps*(ll+jj)];
+			d1 = pVt[1+ps*(ll+jj)];
+			d2 = pVt[2+ps*(ll+jj)];
+			d3 = pVt[3+ps*(ll+jj)];
+			tmp = pC[ll+jj*sdc+ps*0];
+			pW[0+ldw*0] += d0 * tmp;
+			pW[1+ldw*0] += d1 * tmp;
+			pW[2+ldw*0] += d2 * tmp;
+			pW[3+ldw*0] += d3 * tmp;
+			}
+		// compute W^T *= T
+		pW[3+ldw*0] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[1+ldw*0] + pT[3+ldt*2]*pW[2+ldw*0] + pT[3+ldt*3]*pW[3+ldw*0];
+		pW[2+ldw*0] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[1+ldw*0] + pT[2+ldt*2]*pW[2+ldw*0];
+		pW[1+ldw*0] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[1+ldw*0];
+		pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+		// compute C -= V * W^T
+		jj = 0;
+		// load
+		c00 = pC[0+jj*sdc+ps*0];
+		c10 = pC[1+jj*sdc+ps*0];
+		c20 = pC[2+jj*sdc+ps*0];
+		c30 = pC[3+jj*sdc+ps*0];
+		// rank1
+		a1 = pD[1+jj*sdd+ps*0];
+		a2 = pD[2+jj*sdd+ps*0];
+		a3 = pD[3+jj*sdd+ps*0];
+		b0 = pW[0+ldw*0];
+		c00 -= b0;
+		c10 -= a1*b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		// rank2
+		a2 = pD[2+jj*sdd+ps*1];
+		a3 = pD[3+jj*sdd+ps*1];
+		b0 = pW[1+ldw*0];
+		c10 -= b0;
+		c20 -= a2*b0;
+		c30 -= a3*b0;
+		// rank3
+		a3 = pD[3+jj*sdd+ps*2];
+		b0 = pW[2+ldw*0];
+		c20 -= b0;
+		c30 -= a3*b0;
+		// rank4
+		a3 = pD[3+jj*sdd+ps*3];
+		b0 = pW[3+ldw*0];
+		c30 -= b0;
+		// store
+		pC[0+jj*sdc+ps*0] = c00;
+		if(m>1)
+			{
+			pC[1+jj*sdc+ps*0] = c10;
+			if(m>2)
+				{
+				pC[2+jj*sdc+ps*0] = c20;
+				if(m>3)
+					{
+					pC[3+jj*sdc+ps*0] = c30;
+					}
+				}
+			}
+		for(jj=4; jj<m-3; jj+=4)
+			{
+			// load
+			c00 = pC[0+jj*sdc+ps*0];
+			c10 = pC[1+jj*sdc+ps*0];
+			c20 = pC[2+jj*sdc+ps*0];
+			c30 = pC[3+jj*sdc+ps*0];
+			//
+			a0 = pD[0+jj*sdd+ps*0];
+			a1 = pD[1+jj*sdd+ps*0];
+			a2 = pD[2+jj*sdd+ps*0];
+			a3 = pD[3+jj*sdd+ps*0];
+			b0 = pW[0+ldw*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			//
+			a0 = pD[0+jj*sdd+ps*1];
+			a1 = pD[1+jj*sdd+ps*1];
+			a2 = pD[2+jj*sdd+ps*1];
+			a3 = pD[3+jj*sdd+ps*1];
+			b0 = pW[1+ldw*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			//
+			a0 = pD[0+jj*sdd+ps*2];
+			a1 = pD[1+jj*sdd+ps*2];
+			a2 = pD[2+jj*sdd+ps*2];
+			a3 = pD[3+jj*sdd+ps*2];
+			b0 = pW[2+ldw*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			//
+			a0 = pD[0+jj*sdd+ps*3];
+			a1 = pD[1+jj*sdd+ps*3];
+			a2 = pD[2+jj*sdd+ps*3];
+			a3 = pD[3+jj*sdd+ps*3];
+			b0 = pW[3+ldw*0];
+			c00 -= a0*b0;
+			c10 -= a1*b0;
+			c20 -= a2*b0;
+			c30 -= a3*b0;
+			// store
+			pC[0+jj*sdc+ps*0] = c00;
+			pC[1+jj*sdc+ps*0] = c10;
+			pC[2+jj*sdc+ps*0] = c20;
+			pC[3+jj*sdc+ps*0] = c30;
+			}
+		for(ll=0; ll<m-jj; ll++)
+			{
+			// load
+			c00 = pC[ll+jj*sdc+ps*0];
+			//
+			a0 = pD[ll+jj*sdd+ps*0];
+			b0 = pW[0+ldw*0];
+			c00 -= a0*b0;
+			//
+			a0 = pD[ll+jj*sdd+ps*1];
+			b0 = pW[1+ldw*0];
+			c00 -= a0*b0;
+			//
+			a0 = pD[ll+jj*sdd+ps*2];
+			b0 = pW[2+ldw*0];
+			c00 -= a0*b0;
+			//
+			a0 = pD[ll+jj*sdd+ps*3];
+			b0 = pW[3+ldw*0];
+			c00 -= a0*b0;
+			// store
+			pC[ll+jj*sdc+ps*0] = c00;
+			}
+		}
+
+	return;
+	}
+
+
+
+// assume n>=4
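+// LQ factorization of a 4 x n panel: the Householder reflectors act along the rows of pD and the scalar factors tau are stored in dD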
+void kernel_dgelqf_4_lib4(int n, double *pD, double *dD)
+	{
+	int ii, jj, ll;
+	double alpha, beta, tmp, w1, w2, w3;
+	const int ps = 4;
+	// first column
+	beta = 0.0;
+	for(ii=1; ii<n; ii++)
+		{
+		tmp = pD[0+ps*ii];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[0] = 0.0;
+		}
+	else
+		{
+		alpha = pD[0+ps*0];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau0
+		dD[0] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v0
+		pD[0+ps*0] = beta;
+		for(ii=1; ii<n; ii++)
+			{
+			pD[0+ps*ii] *= tmp;
+			}
+		}
+	// gemv_t & ger
+	w1 = pD[1+ps*0];
+	w2 = pD[2+ps*0];
+	w3 = pD[3+ps*0];
+	w1 += pD[1+ps*1] * pD[0+ps*1];
+	w2 += pD[2+ps*1] * pD[0+ps*1];
+	w3 += pD[3+ps*1] * pD[0+ps*1];
+	w1 += pD[1+ps*2] * pD[0+ps*2];
+	w2 += pD[2+ps*2] * pD[0+ps*2];
+	w3 += pD[3+ps*2] * pD[0+ps*2];
+	w1 += pD[1+ps*3] * pD[0+ps*3];
+	w2 += pD[2+ps*3] * pD[0+ps*3];
+	w3 += pD[3+ps*3] * pD[0+ps*3];
+	for(ii=4; ii<n; ii++)
+		{
+		w1 += pD[1+ps*ii] * pD[0+ps*ii];
+		w2 += pD[2+ps*ii] * pD[0+ps*ii];
+		w3 += pD[3+ps*ii] * pD[0+ps*ii];
+		}
+	w1 = - dD[0] * w1;
+	w2 = - dD[0] * w2;
+	w3 = - dD[0] * w3;
+	pD[1+ps*0] += w1;
+	pD[2+ps*0] += w2;
+	pD[3+ps*0] += w3;
+	pD[1+ps*1] += w1 * pD[0+ps*1];
+	pD[2+ps*1] += w2 * pD[0+ps*1];
+	pD[3+ps*1] += w3 * pD[0+ps*1];
+	pD[1+ps*2] += w1 * pD[0+ps*2];
+	pD[2+ps*2] += w2 * pD[0+ps*2];
+	pD[3+ps*2] += w3 * pD[0+ps*2];
+	pD[1+ps*3] += w1 * pD[0+ps*3];
+	pD[2+ps*3] += w2 * pD[0+ps*3];
+	pD[3+ps*3] += w3 * pD[0+ps*3];
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] += w1 * pD[0+ps*ii];
+		pD[2+ps*ii] += w2 * pD[0+ps*ii];
+		pD[3+ps*ii] += w3 * pD[0+ps*ii];
+		}
+	// second column
+	beta = 0.0;
+	for(ii=2; ii<n; ii++)
+		{
+		tmp = pD[1+ps*ii];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[1] = 0.0;
+		}
+	else
+		{
+		alpha = pD[1+ps*1];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau1
+		dD[1] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v1
+		pD[1+ps*1] = beta;
+		for(ii=2; ii<n; ii++)
+			{
+			pD[1+ps*ii] *= tmp;
+			}
+		}
+	// gemv_t & ger
+	w2 = pD[2+ps*1];
+	w3 = pD[3+ps*1];
+	w2 += pD[2+ps*2] * pD[1+ps*2];
+	w3 += pD[3+ps*2] * pD[1+ps*2];
+	w2 += pD[2+ps*3] * pD[1+ps*3];
+	w3 += pD[3+ps*3] * pD[1+ps*3];
+	for(ii=4; ii<n; ii++)
+		{
+		w2 += pD[2+ps*ii] * pD[1+ps*ii];
+		w3 += pD[3+ps*ii] * pD[1+ps*ii];
+		}
+	w2 = - dD[1] * w2;
+	w3 = - dD[1] * w3;
+	pD[2+ps*1] += w2;
+	pD[3+ps*1] += w3;
+	pD[2+ps*2] += w2 * pD[1+ps*2];
+	pD[3+ps*2] += w3 * pD[1+ps*2];
+	pD[2+ps*3] += w2 * pD[1+ps*3];
+	pD[3+ps*3] += w3 * pD[1+ps*3];
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] += w2 * pD[1+ps*ii];
+		pD[3+ps*ii] += w3 * pD[1+ps*ii];
+		}
+	// third column
+	beta = 0.0;
+	for(ii=3; ii<n; ii++)
+		{
+		tmp = pD[2+ps*ii];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[2] = 0.0;
+		}
+	else
+		{
+		alpha = pD[2+ps*2];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau2
+		dD[2] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v2
+		pD[2+ps*2] = beta;
+		for(ii=3; ii<n; ii++)
+			{
+			pD[2+ps*ii] *= tmp;
+			}
+		}
+	// gemv_t & ger
+	w3 = pD[3+ps*2];
+	w3 += pD[3+ps*3] * pD[2+ps*3];
+	for(ii=4; ii<n; ii++)
+		{
+		w3 += pD[3+ps*ii] * pD[2+ps*ii];
+		}
+	w3 = - dD[2] * w3;
+	pD[3+ps*2] += w3;
+	pD[3+ps*3] += w3 * pD[2+ps*3];
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] += w3 * pD[2+ps*ii];
+		}
+	// fourth column
+	beta = 0.0;
+	for(ii=4; ii<n; ii++)
+		{
+		tmp = pD[3+ps*ii];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		// tau
+		dD[3] = 0.0;
+		}
+	else
+		{
+		alpha = pD[3+ps*3];
+		beta += alpha*alpha;
+		beta = sqrt(beta);
+		if(alpha>0)
+			beta = -beta;
+		// tau3
+		dD[3] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v3
+		pD[3+ps*3] = beta;
+		for(ii=4; ii<n; ii++)
+			{
+			pD[3+ps*ii] *= tmp;
+			}
+		}
+	return;
+	}
+
+
+
+// unblocked algorithm
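+// variable-size LQ factorization: computes k Householder reflectors along the rows of the m x n panel-major matrix pD (row offset offD) and stores the scalar factors tau in dD; rows are processed two at a time where possible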
+void kernel_dgelqf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+	{
+	if(m<=0 | n<=0)
+		return;
+	int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+	const int ps = 4;
+	imax = k; // number of rows to factorize
+	double alpha, beta, tmp;
+	double w00, w01,
+		   w10, w11,
+		   w20, w21,
+		   w30, w31;
+	double *pC00, *pC10, *pC10a, *pC20, *pC20a, *pC01, *pC11;
+	double pT[4];
+	int ldt = 2;
+	double *pD0 = pD-offD;
+	ii = 0;
+#if 1
+	for(; ii<imax-1; ii+=2)
+		{
+		// first row
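+		// pC00 points to element (ii,ii) of the panel-major matrix, addressed as in kernel_dgeqrf_vs_lib4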
+		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+		beta = 0.0;
+		for(jj=1; jj<n-ii; jj++)
+			{
+			tmp = pC00[0+ps*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			pC00[0] = beta;
+			for(jj=1; jj<n-ii; jj++)
+				pC00[0+ps*jj] *= tmp;
+			}
+		pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+		kmax = n-ii;
+		w00 = pC10[0+ps*0]; // pC00[0+ps*0] = 1.0
+		for(kk=1; kk<kmax; kk++)
+			{
+			w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+			}
+		w00 = - w00*dD[ii];
+		pC10[0+ps*0] += w00; // pC00[0+ps*0] = 1.0
+		for(kk=1; kk<kmax; kk++)
+			{
+			pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+			}
+		// second row
+		pC11 = pC10+ps*1;
+		beta = 0.0;
+		for(jj=1; jj<n-(ii+1); jj++)
+			{
+			tmp = pC11[0+ps*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			dD[(ii+1)] = 0.0;
+			}
+		else
+			{
+			alpha = pC11[0+ps*0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[(ii+1)] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			pC11[0+ps*0] = beta;
+			for(jj=1; jj<n-(ii+1); jj++)
+				pC11[0+ps*jj] *= tmp;
+			}
+		// compute T
+		kmax = n-ii;
+		tmp = 1.0*0.0 + pC00[0+ps*1]*1.0;
+		for(kk=2; kk<kmax; kk++)
+			tmp += pC00[0+ps*kk]*pC10[0+ps*kk];
+		pT[0+ldt*0] = dD[ii+0];
+		pT[0+ldt*1] = - dD[ii+1] * tmp * dD[ii+0];
+		pT[1+ldt*1] = dD[ii+1];
+		// downgrade
+		kmax = n-ii;
+		jmax = m-ii-2;
+		jmax0 = (ps-((ii+2+offD)&(ps-1)))&(ps-1);
+		jmax0 = jmax<jmax0 ? jmax : jmax0;
+		jj = 0;
+		pC20a = &pD0[((offD+ii+2)&(ps-1))+((offD+ii+2)-((offD+ii+2)&(ps-1)))*sdd+ii*ps];
+		pC20 = pC20a;
+		if(jmax0>0)
+			{
+			for( ; jj<jmax0; jj++)
+				{
+				w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+				w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+				for(kk=2; kk<kmax; kk++)
+					{
+					w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+					w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+					}
+				w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
+				w00 = - w00*pT[0+ldt*0];
+				pC20[0+ps*0] += w00*1.0          + w01*0.0;
+				pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+				for(kk=2; kk<kmax; kk++)
+					{
+					pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+					}
+				pC20 += 1;
+				}
+			pC20 += -ps+ps*sdd;
+			}
+		for( ; jj<jmax-3; jj+=4)
+			{
+			w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+			w10 = pC20[1+ps*0]*1.0 + pC20[1+ps*1]*pC00[0+ps*1];
+			w20 = pC20[2+ps*0]*1.0 + pC20[2+ps*1]*pC00[0+ps*1];
+			w30 = pC20[3+ps*0]*1.0 + pC20[3+ps*1]*pC00[0+ps*1];
+			w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+			w11 = pC20[1+ps*0]*0.0 + pC20[1+ps*1]*1.0;
+			w21 = pC20[2+ps*0]*0.0 + pC20[2+ps*1]*1.0;
+			w31 = pC20[3+ps*0]*0.0 + pC20[3+ps*1]*1.0;
+			for(kk=2; kk<kmax; kk++)
+				{
+				w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+				w10 += pC20[1+ps*kk]*pC00[0+ps*kk];
+				w20 += pC20[2+ps*kk]*pC00[0+ps*kk];
+				w30 += pC20[3+ps*kk]*pC00[0+ps*kk];
+				w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+				w11 += pC20[1+ps*kk]*pC10[0+ps*kk];
+				w21 += pC20[2+ps*kk]*pC10[0+ps*kk];
+				w31 += pC20[3+ps*kk]*pC10[0+ps*kk];
+				}
+			w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
+			w11 = - w10*pT[0+ldt*1] - w11*pT[1+ldt*1];
+			w21 = - w20*pT[0+ldt*1] - w21*pT[1+ldt*1];
+			w31 = - w30*pT[0+ldt*1] - w31*pT[1+ldt*1];
+			w00 = - w00*pT[0+ldt*0];
+			w10 = - w10*pT[0+ldt*0];
+			w20 = - w20*pT[0+ldt*0];
+			w30 = - w30*pT[0+ldt*0];
+			pC20[0+ps*0] += w00*1.0          + w01*0.0;
+			pC20[1+ps*0] += w10*1.0          + w11*0.0;
+			pC20[2+ps*0] += w20*1.0          + w21*0.0;
+			pC20[3+ps*0] += w30*1.0          + w31*0.0;
+			pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+			pC20[1+ps*1] += w10*pC00[0+ps*1] + w11*1.0;
+			pC20[2+ps*1] += w20*pC00[0+ps*1] + w21*1.0;
+			pC20[3+ps*1] += w30*pC00[0+ps*1] + w31*1.0;
+			for(kk=2; kk<kmax; kk++)
+				{
+				pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+				pC20[1+ps*kk] += w10*pC00[0+ps*kk] + w11*pC10[0+ps*kk];
+				pC20[2+ps*kk] += w20*pC00[0+ps*kk] + w21*pC10[0+ps*kk];
+				pC20[3+ps*kk] += w30*pC00[0+ps*kk] + w31*pC10[0+ps*kk];
+				}
+			pC20 += ps*sdd;
+			}
+		for(ll=0; ll<jmax-jj; ll++)
+			{
+			w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+			w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+			for(kk=2; kk<kmax; kk++)
+				{
+				w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+				w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+				}
+			w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
+			w00 = - w00*pT[0+ldt*0];
+			pC20[0+ps*0] += w00*1.0          + w01*0.0;
+			pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+			for(kk=2; kk<kmax; kk++)
+				{
+				pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+				}
+			pC20 += 1;
+			}
+		}
+#endif
+	for(; ii<imax; ii++)
+		{
+		pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+		beta = 0.0;
+		for(jj=1; jj<n-ii; jj++)
+			{
+			tmp = pC00[0+ps*jj];
+			beta += tmp*tmp;
+			}
+		if(beta==0.0)
+			{
+			dD[ii] = 0.0;
+			}
+		else
+			{
+			alpha = pC00[0];
+			beta += alpha*alpha;
+			beta = sqrt(beta);
+			if(alpha>0)
+				beta = -beta;
+			dD[ii] = (beta-alpha) / beta;
+			tmp = 1.0 / (alpha-beta);
+			pC00[0] = beta;
+			for(jj=1; jj<n-ii; jj++)
+				pC00[0+ps*jj] *= tmp;
+			}
+		if(ii<n)
+			{
+			kmax = n-ii;
+			jmax = m-ii-1;
+			jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+			jmax0 = jmax<jmax0 ? jmax : jmax0;
+			jj = 0;
+			pC10a = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+			pC10 = pC10a;
+			if(jmax0>0)
+				{
+				for( ; jj<jmax0; jj++)
+					{
+					w00 = pC10[0+ps*0];
+					for(kk=1; kk<kmax; kk++)
+						{
+						w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+						}
+					w00 = - w00*dD[ii];
+					pC10[0+ps*0] += w00;
+					for(kk=1; kk<kmax; kk++)
+						{
+						pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+						}
+					pC10 += 1;
+					}
+				pC10 += -ps+ps*sdd;
+				}
+			for( ; jj<jmax-3; jj+=4)
+				{
+				w00 = pC10[0+ps*0];
+				w10 = pC10[1+ps*0];
+				w20 = pC10[2+ps*0];
+				w30 = pC10[3+ps*0];
+				for(kk=1; kk<kmax; kk++)
+					{
+					w00 += pC10[0+ps*kk]*pC00[0+ps*kk];
+					w10 += pC10[1+ps*kk]*pC00[0+ps*kk];
+					w20 += pC10[2+ps*kk]*pC00[0+ps*kk];
+					w30 += pC10[3+ps*kk]*pC00[0+ps*kk];
+					}
+				w00 = - w00*dD[ii];
+				w10 = - w10*dD[ii];
+				w20 = - w20*dD[ii];
+				w30 = - w30*dD[ii];
+				pC10[0+ps*0] += w00;
+				pC10[1+ps*0] += w10;
+				pC10[2+ps*0] += w20;
+				pC10[3+ps*0] += w30;
+				for(kk=1; kk<kmax; kk++)
+					{
+					pC10[0+ps*kk] += w00*pC00[0+ps*kk];
+					pC10[1+ps*kk] += w10*pC00[0+ps*kk];
+					pC10[2+ps*kk] += w20*pC00[0+ps*kk];
+					pC10[3+ps*kk] += w30*pC00[0+ps*kk];
+					}
+				pC10 += ps*sdd;
+				}
+			for(ll=0; ll<jmax-jj; ll++)
+				{
+				w00 = pC10[0+ps*0];
+				for(kk=1; kk<kmax; kk++)
+					{
+					w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+					}
+				w00 = - w00*dD[ii];
+				pC10[0+ps*0] += w00;
+				for(kk=1; kk<kmax; kk++)
+					{
+					pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+					}
+				pC10 += 1;
+				}
+			}
+		}
+	return;
+	}
+
+
+
+// assume kmax>=4
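+// forms the 4x4 triangular factor T of the block of 4 Householder reflectors whose vectors are stored in the rows of pD and whose scalar factors are in dD (cf. LAPACK dlarft); note that T is stored here with opposite sign, so the kernel_dlarfb4_r_*_lib4 kernels below add rather than subtract the update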
+void kernel_dlarft_4_lib4(int kmax, double *pD, double *dD, double *pT)
+	{
+	const int ps = 4;
+	int kk;
+	double v10,
+	       v20, v21,
+		   v30, v31, v32;
+	// 0
+	// 1
+	v10 =  pD[0+ps*1];
+	// 2
+	v10 += pD[1+ps*2]*pD[0+ps*2];
+	v20 =  pD[0+ps*2];
+	v21 =  pD[1+ps*2];
+	// 3
+	v10 += pD[1+ps*3]*pD[0+ps*3];
+	v20 += pD[2+ps*3]*pD[0+ps*3];
+	v21 += pD[2+ps*3]*pD[1+ps*3];
+	v30 =  pD[0+ps*3];
+	v31 =  pD[1+ps*3];
+	v32 =  pD[2+ps*3];
+	//
+	for(kk=4; kk<kmax; kk++)
+		{
+		v10 += pD[1+ps*kk]*pD[0+ps*kk];
+		v20 += pD[2+ps*kk]*pD[0+ps*kk];
+		v30 += pD[3+ps*kk]*pD[0+ps*kk];
+		v21 += pD[2+ps*kk]*pD[1+ps*kk];
+		v31 += pD[3+ps*kk]*pD[1+ps*kk];
+		v32 += pD[3+ps*kk]*pD[2+ps*kk];
+		}
+	pT[0+ps*0] = - dD[0];
+	pT[1+ps*1] = - dD[1];
+	pT[2+ps*2] = - dD[2];
+	pT[3+ps*3] = - dD[3];
+	pT[0+ps*1] = - dD[1] * (v10*pT[0+ps*0]);
+	pT[1+ps*2] = - dD[2] * (v21*pT[1+ps*1]);
+	pT[2+ps*3] = - dD[3] * (v32*pT[2+ps*2]);
+	pT[0+ps*2] = - dD[2] * (v20*pT[0+ps*0] + v21*pT[0+ps*1]);
+	pT[1+ps*3] = - dD[3] * (v31*pT[1+ps*1] + v32*pT[1+ps*2]);
+	pT[0+ps*3] = - dD[3] * (v30*pT[0+ps*0] + v31*pT[0+ps*1] + v32*pT[0+ps*2]);
+	return;
+	}
+
+
+
+// assume n>=4
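+// fused kernel: LQ-factorizes the 4 x n panel pD (as kernel_dgelqf_4_lib4) and builds the corresponding 4x4 triangular factor T in pT at the same time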
+void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT)
+	{
+	int ii, jj, ll;
+	double alpha, beta, tmp, w0, w1, w2, w3;
+	const int ps = 4;
+	// zero tau matrix
+	for(ii=0; ii<16; ii++)
+		pT[ii] = 0.0;
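+	// note: the squared norm (beta) of each subsequent row is accumulated while applying the previous reflector, so a zero beta short-circuits all remaining columns through the goto chain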
+	// first column
+	beta = 0.0;
+	for(ii=1; ii<n; ii++)
+		{
+		tmp = pD[0+ps*ii];
+		beta += tmp*tmp;
+		}
+	if(beta==0.0)
+		{
+		dD[0] = 0.0;
+		tmp = 0.0;
+		goto col2;
+		}
+	alpha = pD[0+ps*0];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[0] = (beta-alpha) / beta;
+	pT[0+ps*0] = - dD[0];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[0+ps*0] = beta;
+	w1 = pD[1+ps*0];
+	w2 = pD[2+ps*0];
+	w3 = pD[3+ps*0];
+	//
+	pD[0+ps*1] *= tmp;
+	w1 += pD[1+ps*1] * pD[0+ps*1];
+	w2 += pD[2+ps*1] * pD[0+ps*1];
+	w3 += pD[3+ps*1] * pD[0+ps*1];
+	//
+	pD[0+ps*2] *= tmp;
+	w1 += pD[1+ps*2] * pD[0+ps*2];
+	w2 += pD[2+ps*2] * pD[0+ps*2];
+	w3 += pD[3+ps*2] * pD[0+ps*2];
+	//
+	pD[0+ps*3] *= tmp;
+	w1 += pD[1+ps*3] * pD[0+ps*3];
+	w2 += pD[2+ps*3] * pD[0+ps*3];
+	w3 += pD[3+ps*3] * pD[0+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[0+ps*ii] *= tmp;
+		w1 += pD[1+ps*ii] * pD[0+ps*ii];
+		w2 += pD[2+ps*ii] * pD[0+ps*ii];
+		w3 += pD[3+ps*ii] * pD[0+ps*ii];
+		}
+	//
+	w1 = - dD[0] * w1;
+	w2 = - dD[0] * w2;
+	w3 = - dD[0] * w3;
+	//
+	pD[1+ps*0] += w1;
+	pD[2+ps*0] += w2;
+	pD[3+ps*0] += w3;
+	//
+	pD[1+ps*1] += w1 * pD[0+ps*1];
+	pD[2+ps*1] += w2 * pD[0+ps*1];
+	pD[3+ps*1] += w3 * pD[0+ps*1];
+	//
+	pD[1+ps*2] += w1 * pD[0+ps*2];
+	pD[2+ps*2] += w2 * pD[0+ps*2];
+	pD[3+ps*2] += w3 * pD[0+ps*2];
+	beta = pD[1+ps*2] * pD[1+ps*2];
+	//
+	pD[1+ps*3] += w1 * pD[0+ps*3];
+	pD[2+ps*3] += w2 * pD[0+ps*3];
+	pD[3+ps*3] += w3 * pD[0+ps*3];
+	beta += pD[1+ps*3] * pD[1+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] += w1 * pD[0+ps*ii];
+		pD[2+ps*ii] += w2 * pD[0+ps*ii];
+		pD[3+ps*ii] += w3 * pD[0+ps*ii];
+		beta += pD[1+ps*ii] * pD[1+ps*ii];
+		}
+	// second column
+col2:
+	if(beta==0.0)
+		{
+		dD[1] = 0.0;
+		tmp = 0.0;
+		goto col3;
+		}
+	alpha = pD[1+ps*1];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[1] = (beta-alpha) / beta;
+	pT[1+ps*1] = - dD[1];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[1+ps*1] = beta;
+	w0 = pD[0+ps*1]; //
+	w2 = pD[2+ps*1];
+	w3 = pD[3+ps*1];
+	//
+	pD[1+ps*2] *= tmp;
+	w0 += pD[0+ps*2] * pD[1+ps*2]; //
+	w2 += pD[2+ps*2] * pD[1+ps*2];
+	w3 += pD[3+ps*2] * pD[1+ps*2];
+	//
+	pD[1+ps*3] *= tmp;
+	w0 += pD[0+ps*3] * pD[1+ps*3]; //
+	w2 += pD[2+ps*3] * pD[1+ps*3];
+	w3 += pD[3+ps*3] * pD[1+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[1+ps*ii] *= tmp;
+		w0 += pD[0+ps*ii] * pD[1+ps*ii]; //
+		w2 += pD[2+ps*ii] * pD[1+ps*ii];
+		w3 += pD[3+ps*ii] * pD[1+ps*ii];
+		}
+	//
+	pT[0+ps*1] = - dD[1] * (w0*pT[0+ps*0]);
+	w2 = - dD[1] * w2;
+	w3 = - dD[1] * w3;
+	//
+	pD[2+ps*1] += w2;
+	pD[3+ps*1] += w3;
+	//
+	pD[2+ps*2] += w2 * pD[1+ps*2];
+	pD[3+ps*2] += w3 * pD[1+ps*2];
+	//
+	pD[2+ps*3] += w2 * pD[1+ps*3];
+	pD[3+ps*3] += w3 * pD[1+ps*3];
+	beta = pD[2+ps*3] * pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] += w2 * pD[1+ps*ii];
+		pD[3+ps*ii] += w3 * pD[1+ps*ii];
+		beta += pD[2+ps*ii] * pD[2+ps*ii];
+		}
+	// third column
+col3:
+	if(beta==0.0)
+		{
+		dD[2] = 0.0;
+		tmp = 0.0;
+		goto col4;
+		}
+	alpha = pD[2+ps*2];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[2] = (beta-alpha) / beta;
+	pT[2+ps*2] = - dD[2];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[2+ps*2] = beta;
+	w0 = pD[0+ps*2];
+	w1 = pD[1+ps*2];
+	w3 = pD[3+ps*2];
+	//
+	pD[2+ps*3] *= tmp;
+	w0 += pD[0+ps*3] * pD[2+ps*3];
+	w1 += pD[1+ps*3] * pD[2+ps*3];
+	w3 += pD[3+ps*3] * pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[2+ps*ii] *= tmp;
+		w0 += pD[0+ps*ii] * pD[2+ps*ii];
+		w1 += pD[1+ps*ii] * pD[2+ps*ii];
+		w3 += pD[3+ps*ii] * pD[2+ps*ii];
+		}
+	//
+	pT[1+ps*2] = - dD[2] * (w1*pT[1+ps*1]);
+	pT[0+ps*2] = - dD[2] * (w0*pT[0+ps*0] + w1*pT[0+ps*1]);
+	w3 = - dD[2] * w3;
+	//
+	pD[3+ps*2] += w3;
+	//
+	pD[3+ps*3] += w3 * pD[2+ps*3];
+	//
+	beta = 0.0;
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] += w3 * pD[2+ps*ii];
+		beta += pD[3+ps*ii] * pD[3+ps*ii];
+		}
+	// fourth column
+col4:
+	if(beta==0.0)
+		{
+		dD[3] = 0.0;
+		tmp = 0.0;
+		return;
+		}
+	alpha = pD[3+ps*3];
+	beta += alpha*alpha;
+	beta = sqrt(beta);
+	if(alpha>0)
+		beta = -beta;
+	dD[3] = (beta-alpha) / beta;
+	pT[3+ps*3] = - dD[3];
+	tmp = 1.0 / (alpha-beta);
+	//
+	pD[3+ps*3] = beta;
+	w0 =  pD[0+ps*3];
+	w1 =  pD[1+ps*3];
+	w2 =  pD[2+ps*3];
+	//
+	for(ii=4; ii<n; ii++)
+		{
+		pD[3+ps*ii] *= tmp;
+		w0 += pD[0+ps*ii] * pD[3+ps*ii];
+		w1 += pD[1+ps*ii] * pD[3+ps*ii];
+		w2 += pD[2+ps*ii] * pD[3+ps*ii];
+		}
+	//
+	pT[2+ps*3] = - dD[3] * (w2*pT[2+ps*2]);
+	pT[1+ps*3] = - dD[3] * (w1*pT[1+ps*1] + w2*pT[1+ps*2]);
+	pT[0+ps*3] = - dD[3] * (w0*pT[0+ps*0] + w1*pT[0+ps*1] + w2*pT[0+ps*2]);
+	return;
+	}
+
+
+
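+// applies the block of 4 reflectors (vectors in the rows of pV, triangular factor in pT as produced by kernel_dlarft_4_lib4 or kernel_dgelqf_dlarft4_4_lib4) from the right to 4 rows of pD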
+void kernel_dlarfb4_r_4_lib4(int kmax, double *pV, double *pT, double *pD)
+	{
+	const int ps = 4;
+	double pW[16];
+	int kk;
+	// 0
+	pW[0+ps*0] = pD[0+ps*0];
+	pW[1+ps*0] = pD[1+ps*0];
+	pW[2+ps*0] = pD[2+ps*0];
+	pW[3+ps*0] = pD[3+ps*0];
+	// 1
+	pW[0+ps*0] += pD[0+ps*1]*pV[0+ps*1];
+	pW[1+ps*0] += pD[1+ps*1]*pV[0+ps*1];
+	pW[2+ps*0] += pD[2+ps*1]*pV[0+ps*1];
+	pW[3+ps*0] += pD[3+ps*1]*pV[0+ps*1];
+	pW[0+ps*1] = pD[0+ps*1];
+	pW[1+ps*1] = pD[1+ps*1];
+	pW[2+ps*1] = pD[2+ps*1];
+	pW[3+ps*1] = pD[3+ps*1];
+	// 2
+	pW[0+ps*0] += pD[0+ps*2]*pV[0+ps*2];
+	pW[1+ps*0] += pD[1+ps*2]*pV[0+ps*2];
+	pW[2+ps*0] += pD[2+ps*2]*pV[0+ps*2];
+	pW[3+ps*0] += pD[3+ps*2]*pV[0+ps*2];
+	pW[0+ps*1] += pD[0+ps*2]*pV[1+ps*2];
+	pW[1+ps*1] += pD[1+ps*2]*pV[1+ps*2];
+	pW[2+ps*1] += pD[2+ps*2]*pV[1+ps*2];
+	pW[3+ps*1] += pD[3+ps*2]*pV[1+ps*2];
+	pW[0+ps*2] = pD[0+ps*2];
+	pW[1+ps*2] = pD[1+ps*2];
+	pW[2+ps*2] = pD[2+ps*2];
+	pW[3+ps*2] = pD[3+ps*2];
+	// 3
+	pW[0+ps*0] += pD[0+ps*3]*pV[0+ps*3];
+	pW[1+ps*0] += pD[1+ps*3]*pV[0+ps*3];
+	pW[2+ps*0] += pD[2+ps*3]*pV[0+ps*3];
+	pW[3+ps*0] += pD[3+ps*3]*pV[0+ps*3];
+	pW[0+ps*1] += pD[0+ps*3]*pV[1+ps*3];
+	pW[1+ps*1] += pD[1+ps*3]*pV[1+ps*3];
+	pW[2+ps*1] += pD[2+ps*3]*pV[1+ps*3];
+	pW[3+ps*1] += pD[3+ps*3]*pV[1+ps*3];
+	pW[0+ps*2] += pD[0+ps*3]*pV[2+ps*3];
+	pW[1+ps*2] += pD[1+ps*3]*pV[2+ps*3];
+	pW[2+ps*2] += pD[2+ps*3]*pV[2+ps*3];
+	pW[3+ps*2] += pD[3+ps*3]*pV[2+ps*3];
+	pW[0+ps*3] = pD[0+ps*3];
+	pW[1+ps*3] = pD[1+ps*3];
+	pW[2+ps*3] = pD[2+ps*3];
+	pW[3+ps*3] = pD[3+ps*3];
+	//
+	for(kk=4; kk<kmax; kk++)
+		{
+		pW[0+ps*0] += pD[0+ps*kk]*pV[0+ps*kk];
+		pW[1+ps*0] += pD[1+ps*kk]*pV[0+ps*kk];
+		pW[2+ps*0] += pD[2+ps*kk]*pV[0+ps*kk];
+		pW[3+ps*0] += pD[3+ps*kk]*pV[0+ps*kk];
+		pW[0+ps*1] += pD[0+ps*kk]*pV[1+ps*kk];
+		pW[1+ps*1] += pD[1+ps*kk]*pV[1+ps*kk];
+		pW[2+ps*1] += pD[2+ps*kk]*pV[1+ps*kk];
+		pW[3+ps*1] += pD[3+ps*kk]*pV[1+ps*kk];
+		pW[0+ps*2] += pD[0+ps*kk]*pV[2+ps*kk];
+		pW[1+ps*2] += pD[1+ps*kk]*pV[2+ps*kk];
+		pW[2+ps*2] += pD[2+ps*kk]*pV[2+ps*kk];
+		pW[3+ps*2] += pD[3+ps*kk]*pV[2+ps*kk];
+		pW[0+ps*3] += pD[0+ps*kk]*pV[3+ps*kk];
+		pW[1+ps*3] += pD[1+ps*kk]*pV[3+ps*kk];
+		pW[2+ps*3] += pD[2+ps*kk]*pV[3+ps*kk];
+		pW[3+ps*3] += pD[3+ps*kk]*pV[3+ps*kk];
+		}
+	//
+	pW[0+ps*3] = pW[0+ps*0]*pT[0+ps*3] + pW[0+ps*1]*pT[1+ps*3] + pW[0+ps*2]*pT[2+ps*3] + pW[0+ps*3]*pT[3+ps*3];
+	pW[1+ps*3] = pW[1+ps*0]*pT[0+ps*3] + pW[1+ps*1]*pT[1+ps*3] + pW[1+ps*2]*pT[2+ps*3] + pW[1+ps*3]*pT[3+ps*3];
+	pW[2+ps*3] = pW[2+ps*0]*pT[0+ps*3] + pW[2+ps*1]*pT[1+ps*3] + pW[2+ps*2]*pT[2+ps*3] + pW[2+ps*3]*pT[3+ps*3];
+	pW[3+ps*3] = pW[3+ps*0]*pT[0+ps*3] + pW[3+ps*1]*pT[1+ps*3] + pW[3+ps*2]*pT[2+ps*3] + pW[3+ps*3]*pT[3+ps*3];
+	//
+	pW[0+ps*2] = pW[0+ps*0]*pT[0+ps*2] + pW[0+ps*1]*pT[1+ps*2] + pW[0+ps*2]*pT[2+ps*2];
+	pW[1+ps*2] = pW[1+ps*0]*pT[0+ps*2] + pW[1+ps*1]*pT[1+ps*2] + pW[1+ps*2]*pT[2+ps*2];
+	pW[2+ps*2] = pW[2+ps*0]*pT[0+ps*2] + pW[2+ps*1]*pT[1+ps*2] + pW[2+ps*2]*pT[2+ps*2];
+	pW[3+ps*2] = pW[3+ps*0]*pT[0+ps*2] + pW[3+ps*1]*pT[1+ps*2] + pW[3+ps*2]*pT[2+ps*2];
+	//
+	pW[0+ps*1] = pW[0+ps*0]*pT[0+ps*1] + pW[0+ps*1]*pT[1+ps*1];
+	pW[1+ps*1] = pW[1+ps*0]*pT[0+ps*1] + pW[1+ps*1]*pT[1+ps*1];
+	pW[2+ps*1] = pW[2+ps*0]*pT[0+ps*1] + pW[2+ps*1]*pT[1+ps*1];
+	pW[3+ps*1] = pW[3+ps*0]*pT[0+ps*1] + pW[3+ps*1]*pT[1+ps*1];
+	//
+	pW[0+ps*0] = pW[0+ps*0]*pT[0+ps*0];
+	pW[1+ps*0] = pW[1+ps*0]*pT[0+ps*0];
+	pW[2+ps*0] = pW[2+ps*0]*pT[0+ps*0];
+	pW[3+ps*0] = pW[3+ps*0]*pT[0+ps*0];
+	//
+	pD[0+ps*0] += pW[0+ps*0];
+	pD[1+ps*0] += pW[1+ps*0];
+	pD[2+ps*0] += pW[2+ps*0];
+	pD[3+ps*0] += pW[3+ps*0];
+	//
+	pD[0+ps*1] += pW[0+ps*0]*pV[0+ps*1] + pW[0+ps*1];
+	pD[1+ps*1] += pW[1+ps*0]*pV[0+ps*1] + pW[1+ps*1];
+	pD[2+ps*1] += pW[2+ps*0]*pV[0+ps*1] + pW[2+ps*1];
+	pD[3+ps*1] += pW[3+ps*0]*pV[0+ps*1] + pW[3+ps*1];
+	//
+	pD[0+ps*2] += pW[0+ps*0]*pV[0+ps*2] + pW[0+ps*1]*pV[1+ps*2] + pW[0+ps*2];
+	pD[1+ps*2] += pW[1+ps*0]*pV[0+ps*2] + pW[1+ps*1]*pV[1+ps*2] + pW[1+ps*2];
+	pD[2+ps*2] += pW[2+ps*0]*pV[0+ps*2] + pW[2+ps*1]*pV[1+ps*2] + pW[2+ps*2];
+	pD[3+ps*2] += pW[3+ps*0]*pV[0+ps*2] + pW[3+ps*1]*pV[1+ps*2] + pW[3+ps*2];
+	//
+	pD[0+ps*3] += pW[0+ps*0]*pV[0+ps*3] + pW[0+ps*1]*pV[1+ps*3] + pW[0+ps*2]*pV[2+ps*3] + pW[0+ps*3];
+	pD[1+ps*3] += pW[1+ps*0]*pV[0+ps*3] + pW[1+ps*1]*pV[1+ps*3] + pW[1+ps*2]*pV[2+ps*3] + pW[1+ps*3];
+	pD[2+ps*3] += pW[2+ps*0]*pV[0+ps*3] + pW[2+ps*1]*pV[1+ps*3] + pW[2+ps*2]*pV[2+ps*3] + pW[2+ps*3];
+	pD[3+ps*3] += pW[3+ps*0]*pV[0+ps*3] + pW[3+ps*1]*pV[1+ps*3] + pW[3+ps*2]*pV[2+ps*3] + pW[3+ps*3];
+	for(kk=4; kk<kmax; kk++)
+		{
+		pD[0+ps*kk] += pW[0+ps*0]*pV[0+ps*kk] + pW[0+ps*1]*pV[1+ps*kk] + pW[0+ps*2]*pV[2+ps*kk] + pW[0+ps*3]*pV[3+ps*kk];
+		pD[1+ps*kk] += pW[1+ps*0]*pV[0+ps*kk] + pW[1+ps*1]*pV[1+ps*kk] + pW[1+ps*2]*pV[2+ps*kk] + pW[1+ps*3]*pV[3+ps*kk];
+		pD[2+ps*kk] += pW[2+ps*0]*pV[0+ps*kk] + pW[2+ps*1]*pV[1+ps*kk] + pW[2+ps*2]*pV[2+ps*kk] + pW[2+ps*3]*pV[3+ps*kk];
+		pD[3+ps*kk] += pW[3+ps*0]*pV[0+ps*kk] + pW[3+ps*1]*pV[1+ps*kk] + pW[3+ps*2]*pV[2+ps*kk] + pW[3+ps*3]*pV[3+ps*kk];
+		}
+	return;
+	}
+
+
+
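+// same as kernel_dlarfb4_r_4_lib4, but applies the block of 4 reflectors from the right to a single row of pD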
+void kernel_dlarfb4_r_1_lib4(int kmax, double *pV, double *pT, double *pD)
+	{
+	const int ps = 4;
+	double pW[16];
+	int kk;
+	// 0
+	pW[0+ps*0] = pD[0+ps*0];
+	// 1
+	pW[0+ps*0] += pD[0+ps*1]*pV[0+ps*1];
+	pW[0+ps*1] = pD[0+ps*1];
+	// 2
+	pW[0+ps*0] += pD[0+ps*2]*pV[0+ps*2];
+	pW[0+ps*1] += pD[0+ps*2]*pV[1+ps*2];
+	pW[0+ps*2] = pD[0+ps*2];
+	// 3
+	pW[0+ps*0] += pD[0+ps*3]*pV[0+ps*3];
+	pW[0+ps*1] += pD[0+ps*3]*pV[1+ps*3];
+	pW[0+ps*2] += pD[0+ps*3]*pV[2+ps*3];
+	pW[0+ps*3] = pD[0+ps*3];
+	//
+	for(kk=4; kk<kmax; kk++)
+		{
+		pW[0+ps*0] += pD[0+ps*kk]*pV[0+ps*kk];
+		pW[0+ps*1] += pD[0+ps*kk]*pV[1+ps*kk];
+		pW[0+ps*2] += pD[0+ps*kk]*pV[2+ps*kk];
+		pW[0+ps*3] += pD[0+ps*kk]*pV[3+ps*kk];
+		}
+	//
+	pW[0+ps*3] = pW[0+ps*0]*pT[0+ps*3] + pW[0+ps*1]*pT[1+ps*3] + pW[0+ps*2]*pT[2+ps*3] + pW[0+ps*3]*pT[3+ps*3];
+	//
+	pW[0+ps*2] = pW[0+ps*0]*pT[0+ps*2] + pW[0+ps*1]*pT[1+ps*2] + pW[0+ps*2]*pT[2+ps*2];
+	//
+	pW[0+ps*1] = pW[0+ps*0]*pT[0+ps*1] + pW[0+ps*1]*pT[1+ps*1];
+	//
+	pW[0+ps*0] = pW[0+ps*0]*pT[0+ps*0];
+	//
+	pD[0+ps*0] += pW[0+ps*0];
+	//
+	pD[0+ps*1] += pW[0+ps*0]*pV[0+ps*1] + pW[0+ps*1];
+	//
+	pD[0+ps*2] += pW[0+ps*0]*pV[0+ps*2] + pW[0+ps*1]*pV[1+ps*2] + pW[0+ps*2];
+	//
+	pD[0+ps*3] += pW[0+ps*0]*pV[0+ps*3] + pW[0+ps*1]*pV[1+ps*3] + pW[0+ps*2]*pV[2+ps*3] + pW[0+ps*3];
+	for(kk=4; kk<kmax; kk++)
+		{
+		pD[0+ps*kk] += pW[0+ps*0]*pV[0+ps*kk] + pW[0+ps*1]*pV[1+ps*kk] + pW[0+ps*2]*pV[2+ps*kk] + pW[0+ps*3]*pV[3+ps*kk];
+		}
+	return;
+	}
diff --git a/kernel/c99/kernel_dgetrf_pivot_4_lib4.c b/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..787322e
--- /dev/null
+++ b/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
@@ -0,0 +1,779 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
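+// panel-major ("lib4") storage used throughout: the matrix is packed in horizontal
+// panels of bs=4 rows; within a panel, element (i,j) is at p[i + bs*j], and the next
+// panel of the same columns starts bs*sda elements later (sda = panel stride)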
+
+// C numbering, starting from 0
+void didamax_lib4(int n, int offset, double *pA, int sda, int *p_idamax, double *p_amax)
+	{
+
+	int idamax, ii;
+	double tmp, amax;
+		
+	p_idamax[0] = -1;
+	if(n<1)
+		return;
+
+	const int bs = 4;
+
+	int na = (bs - offset%bs)%bs;
+	na = n<na ? n : na;
+
+	amax = -1.0;
+	ii = 0;
+	if(na>0)
+		{
+		for( ; ii<na; ii++)
+			{
+			tmp = fabs(pA[0]);
+			if(tmp>amax)
+				{
+				idamax = ii+0;
+				amax = tmp;
+				}
+			pA += 1;
+			}
+		pA += bs*(sda-1);
+		}
+	for( ; ii<n-3; ii+=4)
+		{
+		tmp = fabs(pA[0]);
+		if(tmp>amax)
+			{
+			idamax = ii+0;
+			amax = tmp;
+			}
+		tmp = fabs(pA[1]);
+		if(tmp>amax)
+			{
+			idamax = ii+1;
+			amax = tmp;
+			}
+		tmp = fabs(pA[2]);
+		if(tmp>amax)
+			{
+			idamax = ii+2;
+			amax = tmp;
+			}
+		tmp = fabs(pA[3]);
+		if(tmp>amax)
+			{
+			idamax = ii+3;
+			amax = tmp;
+			}
+		pA += bs*sda;
+		}
+	for( ; ii<n; ii++)
+		{
+		tmp = fabs(pA[0]);
+		if(tmp>amax)
+			{
+			idamax = ii+0;
+			amax = tmp;
+			}
+		pA += 1;
+		}
+	
+	p_amax[0] = amax;
+	p_idamax[0] = idamax;
+
+	return;
+
+	}
+
+
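+// minimal usage sketch (illustrative only, assuming an 8x1 column packed into two
+// 4-row panels with panel stride sda=1; the example function is hypothetical)
+#if 0
+void didamax_lib4_example()
+	{
+	double col[8] = {0.5, -3.0, 2.0, -0.25, 1.0, -7.5, 0.0, 4.0};
+	int idx;
+	double amax;
+	didamax_lib4(8, 0, col, 1, &idx, &amax);
+	// idx==5, amax==7.5: the entry of largest magnitude is -7.5 in (0-based) row 5
+	}
+#endif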
+
+// C numbering (starting from zero) in the ipiv
+// it processes m>=4 rows and 4 cols
+void kernel_dgetrf_pivot_4_lib4(int m, double *pA, int sda, double *inv_diag_A, int* ipiv)
+	{
+
+	const int bs = 4;
+
+	// assume m>=4
+	int ma = m-4;
+
+	double
+		tmp0, tmp1, tmp2, tmp3,
+		u_00, u_01, u_02, u_03,
+		      u_11, u_12, u_13,
+		            u_22, u_23,
+		                  u_33;
+	
+	double
+		*pB;
+	
+	int 
+		k, idamax;
+	
+	// first column
+	didamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
+	ipiv[0] = idamax;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[0]!=0)
+			drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+		tmp0 = 1.0 / pA[0+bs*0];
+		inv_diag_A[0] = tmp0;
+		pA[1+bs*0] *= tmp0;
+		pA[2+bs*0] *= tmp0;
+		pA[3+bs*0] *= tmp0;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*0] *= tmp0;
+			pB[1+bs*0] *= tmp0;
+			pB[2+bs*0] *= tmp0;
+			pB[3+bs*0] *= tmp0;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*0] *= tmp0;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[0] = 0.0;
+		}
+
+	// second column
+	u_01  = pA[0+bs*1];
+	tmp1  = pA[1+bs*1];
+	tmp2  = pA[2+bs*1];
+	tmp3  = pA[3+bs*1];
+	tmp1 -= pA[1+bs*0] * u_01;
+	tmp2 -= pA[2+bs*0] * u_01;
+	tmp3 -= pA[3+bs*0] * u_01;
+	pA[1+bs*1] = tmp1;
+	pA[2+bs*1] = tmp2;
+	pA[3+bs*1] = tmp3;
+	pB = pA + bs*sda;
+	for(k=0; k<ma-3; k+=4)
+		{
+		tmp0  = pB[0+bs*1];
+		tmp1  = pB[1+bs*1];
+		tmp2  = pB[2+bs*1];
+		tmp3  = pB[3+bs*1];
+		tmp0 -= pB[0+bs*0] * u_01;
+		tmp1 -= pB[1+bs*0] * u_01;
+		tmp2 -= pB[2+bs*0] * u_01;
+		tmp3 -= pB[3+bs*0] * u_01;
+		pB[0+bs*1] = tmp0;
+		pB[1+bs*1] = tmp1;
+		pB[2+bs*1] = tmp2;
+		pB[3+bs*1] = tmp3;
+		pB += bs*sda;
+		}
+	for( ; k<ma; k++)
+		{
+		tmp0 = pB[0+bs*1];
+		tmp0 -= pB[0+bs*0] * u_01;
+		pB[0+bs*1] = tmp0;
+		pB += 1;
+		}
+
+	didamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
+	ipiv[1] = idamax+1;
+	if(tmp1!=0)
+		{
+		if(ipiv[1]!=1)
+			drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+		tmp1 = 1.0 / pA[1+bs*1];
+		inv_diag_A[1] = tmp1;
+		pA[2+bs*1] *= tmp1;
+		pA[3+bs*1] *= tmp1;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*1] *= tmp1;
+			pB[1+bs*1] *= tmp1;
+			pB[2+bs*1] *= tmp1;
+			pB[3+bs*1] *= tmp1;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*1] *= tmp1;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[1] = 0.0;
+		}
+
+	// third column
+	u_02  = pA[0+bs*2];
+	u_12  = pA[1+bs*2];
+	u_12 -= pA[1+bs*0] * u_02;
+	pA[1+bs*2] = u_12;
+	tmp2  = pA[2+bs*2];
+	tmp3  = pA[3+bs*2];
+	tmp2 -= pA[2+bs*0] * u_02;
+	tmp3 -= pA[3+bs*0] * u_02;
+	tmp2 -= pA[2+bs*1] * u_12;
+	tmp3 -= pA[3+bs*1] * u_12;
+	pA[2+bs*2] = tmp2;
+	pA[3+bs*2] = tmp3;
+	pB = pA + bs*sda;
+	for(k=0; k<ma-3; k+=4)
+		{
+		tmp0  = pB[0+bs*2];
+		tmp1  = pB[1+bs*2];
+		tmp2  = pB[2+bs*2];
+		tmp3  = pB[3+bs*2];
+		tmp0 -= pB[0+bs*0] * u_02;
+		tmp1 -= pB[1+bs*0] * u_02;
+		tmp2 -= pB[2+bs*0] * u_02;
+		tmp3 -= pB[3+bs*0] * u_02;
+		tmp0 -= pB[0+bs*1] * u_12;
+		tmp1 -= pB[1+bs*1] * u_12;
+		tmp2 -= pB[2+bs*1] * u_12;
+		tmp3 -= pB[3+bs*1] * u_12;
+		pB[0+bs*2] = tmp0;
+		pB[1+bs*2] = tmp1;
+		pB[2+bs*2] = tmp2;
+		pB[3+bs*2] = tmp3;
+		pB += bs*sda;
+		}
+	for( ; k<ma; k++)
+		{
+		tmp0  = pB[0+bs*2];
+		tmp0 -= pB[0+bs*0] * u_02;
+		tmp0 -= pB[0+bs*1] * u_12;
+		pB[0+bs*2] = tmp0;
+		pB += 1;
+		}
+
+	didamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+	ipiv[2] = idamax+2;
+	if(tmp2!=0)
+		{
+		if(ipiv[2]!=2)
+			drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+		tmp2 = 1.0 / pA[2+bs*2];
+		inv_diag_A[2] = tmp2;
+		pA[3+bs*2] *= tmp2;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*2] *= tmp2;
+			pB[1+bs*2] *= tmp2;
+			pB[2+bs*2] *= tmp2;
+			pB[3+bs*2] *= tmp2;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*2] *= tmp2;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[2] = 0.0;
+		}
+
+	// fourth column
+	u_03  = pA[0+bs*3];
+	u_13  = pA[1+bs*3];
+	u_13 -= pA[1+bs*0] * u_03;
+	pA[1+bs*3] = u_13;
+	u_23  = pA[2+bs*3];
+	u_23 -= pA[2+bs*0] * u_03;
+	u_23 -= pA[2+bs*1] * u_13;
+	pA[2+bs*3] = u_23;
+	tmp3  = pA[3+bs*3];
+	tmp3 -= pA[3+bs*0] * u_03;
+	tmp3 -= pA[3+bs*1] * u_13;
+	tmp3 -= pA[3+bs*2] * u_23;
+	pA[3+bs*3] = tmp3;
+	pB = pA + bs*sda;
+	for(k=0; k<ma-3; k+=4)
+		{
+		tmp0  = pB[0+bs*3];
+		tmp1  = pB[1+bs*3];
+		tmp2  = pB[2+bs*3];
+		tmp3  = pB[3+bs*3];
+		tmp0 -= pB[0+bs*0] * u_03;
+		tmp1 -= pB[1+bs*0] * u_03;
+		tmp2 -= pB[2+bs*0] * u_03;
+		tmp3 -= pB[3+bs*0] * u_03;
+		tmp0 -= pB[0+bs*1] * u_13;
+		tmp1 -= pB[1+bs*1] * u_13;
+		tmp2 -= pB[2+bs*1] * u_13;
+		tmp3 -= pB[3+bs*1] * u_13;
+		tmp0 -= pB[0+bs*2] * u_23;
+		tmp1 -= pB[1+bs*2] * u_23;
+		tmp2 -= pB[2+bs*2] * u_23;
+		tmp3 -= pB[3+bs*2] * u_23;
+		pB[0+bs*3] = tmp0;
+		pB[1+bs*3] = tmp1;
+		pB[2+bs*3] = tmp2;
+		pB[3+bs*3] = tmp3;
+		pB += bs*sda;
+		}
+	for( ; k<ma; k++)
+		{
+		tmp0  = pB[0+bs*3];
+		tmp0 -= pB[0+bs*0] * u_03;
+		tmp0 -= pB[0+bs*1] * u_13;
+		tmp0 -= pB[0+bs*2] * u_23;
+		pB[0+bs*3] = tmp0;
+		pB += 1;
+		}
+
+	didamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+	ipiv[3] = idamax+3;
+	if(tmp3!=0)
+		{
+		if(ipiv[3]!=3)
+			drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+		tmp3 = 1.0 / pA[3+bs*3];
+		inv_diag_A[3] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*3] *= tmp3;
+			pB[1+bs*3] *= tmp3;
+			pB[2+bs*3] *= tmp3;
+			pB[3+bs*3] *= tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*3] *= tmp3;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[3] = 0.0;
+		}
+	
+	return;
+
+	}
+
+
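+// both pivoted getrf kernels proceed column by column over the 4-wide panel:
+// correct column j with the already factorized columns 0..j-1, search the pivot with
+// didamax_lib4, record it (0-based, relative to the panel) in ipiv[j], swap rows with
+// drowsw_lib, then scale the subdiagonal by the pivot reciprocal, which is also stored
+// in inv_diag_A[j] (0.0 is stored if the pivot is exactly zero)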
+
+// it processes m>0 rows and 0<n<=4 cols
+void kernel_dgetrf_pivot_4_vs_lib4(int m, int n, double *pA, int sda, double *inv_diag_A, int* ipiv)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	// rows below the 4x4 diagonal block (used only in the m>=4 branches)
+	int ma = m-4;
+
+	double
+		tmp0, tmp1, tmp2, tmp3,
+		u_00, u_01, u_02, u_03,
+		      u_11, u_12, u_13,
+		            u_22, u_23,
+		                  u_33;
+	
+	double
+		*pB;
+	
+	int 
+		k, idamax;
+	
+	// first column
+
+	// find pivot & scale
+	didamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
+	ipiv[0] = idamax;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[0]!=0)
+			drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+		tmp0 = 1.0 / pA[0+bs*0];
+		inv_diag_A[0] = tmp0;
+		if(m>=4)
+			{
+			pA[1+bs*0] *= tmp0;
+			pA[2+bs*0] *= tmp0;
+			pA[3+bs*0] *= tmp0;
+			pB = pA + bs*sda;
+			for(k=0; k<ma-3; k+=4)
+				{
+				pB[0+bs*0] *= tmp0;
+				pB[1+bs*0] *= tmp0;
+				pB[2+bs*0] *= tmp0;
+				pB[3+bs*0] *= tmp0;
+				pB += bs*sda;
+				}
+			for( ; k<ma; k++)
+				{
+				pB[0+bs*0] *= tmp0;
+				pB += 1;
+				}
+			}
+		else // m = {1,2,3}
+			{
+			if(m>1)
+				{
+				pA[1+bs*0] *= tmp0;
+				if(m>2)
+					pA[2+bs*0] *= tmp0;
+				}
+			}
+		}
+	else
+		{
+		inv_diag_A[0] = 0.0;
+		}
+	
+	if(n==1 || m==1) // XXX with a single column or a single row left there is nothing more to do, so we can return here
+		return;
+
+	// second column
+
+	// correct
+	if(m>=4)
+		{
+		u_01  = pA[0+bs*1];
+		tmp1  = pA[1+bs*1];
+		tmp2  = pA[2+bs*1];
+		tmp3  = pA[3+bs*1];
+		tmp1 -= pA[1+bs*0] * u_01;
+		tmp2 -= pA[2+bs*0] * u_01;
+		tmp3 -= pA[3+bs*0] * u_01;
+		pA[1+bs*1] = tmp1;
+		pA[2+bs*1] = tmp2;
+		pA[3+bs*1] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			tmp0  = pB[0+bs*1];
+			tmp1  = pB[1+bs*1];
+			tmp2  = pB[2+bs*1];
+			tmp3  = pB[3+bs*1];
+			tmp0 -= pB[0+bs*0] * u_01;
+			tmp1 -= pB[1+bs*0] * u_01;
+			tmp2 -= pB[2+bs*0] * u_01;
+			tmp3 -= pB[3+bs*0] * u_01;
+			pB[0+bs*1] = tmp0;
+			pB[1+bs*1] = tmp1;
+			pB[2+bs*1] = tmp2;
+			pB[3+bs*1] = tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			tmp0 = pB[0+bs*1];
+			tmp0 -= pB[0+bs*0] * u_01;
+			pB[0+bs*1] = tmp0;
+			pB += 1;
+			}
+		}
+	else // m = {2,3}
+		{
+		u_01  = pA[0+bs*1];
+		tmp1  = pA[1+bs*1];
+		tmp1 -= pA[1+bs*0] * u_01;
+		pA[1+bs*1] = tmp1;
+		if(m>2)
+			{
+			tmp2  = pA[2+bs*1];
+			tmp2 -= pA[2+bs*0] * u_01;
+			pA[2+bs*1] = tmp2;
+			}
+		}
+
+	// find pivot & scale
+	didamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
+	ipiv[1] = idamax+1;
+	if(tmp1!=0)
+		{
+		if(ipiv[1]!=1)
+			drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+		tmp1 = 1.0 / pA[1+bs*1];
+		inv_diag_A[1] = tmp1;
+		if(m>=4)
+			{
+			pA[2+bs*1] *= tmp1;
+			pA[3+bs*1] *= tmp1;
+			pB = pA + bs*sda;
+			for(k=0; k<ma-3; k+=4)
+				{
+				pB[0+bs*1] *= tmp1;
+				pB[1+bs*1] *= tmp1;
+				pB[2+bs*1] *= tmp1;
+				pB[3+bs*1] *= tmp1;
+				pB += bs*sda;
+				}
+			for( ; k<ma; k++)
+				{
+				pB[0+bs*1] *= tmp1;
+				pB += 1;
+				}
+			}
+		else // m = {2,3}
+			{
+			if(m>2)
+				pA[2+bs*1] *= tmp1;
+			}
+		}
+	else
+		{
+		inv_diag_A[1] = 0.0;
+		}
+
+	if(n==2)
+		return;
+
+	// third column
+
+	// correct
+	if(m>=4)
+		{
+		u_02  = pA[0+bs*2];
+		u_12  = pA[1+bs*2];
+		u_12 -= pA[1+bs*0] * u_02;
+		pA[1+bs*2] = u_12;
+		tmp2  = pA[2+bs*2];
+		tmp3  = pA[3+bs*2];
+		tmp2 -= pA[2+bs*0] * u_02;
+		tmp3 -= pA[3+bs*0] * u_02;
+		tmp2 -= pA[2+bs*1] * u_12;
+		tmp3 -= pA[3+bs*1] * u_12;
+		pA[2+bs*2] = tmp2;
+		pA[3+bs*2] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			tmp0  = pB[0+bs*2];
+			tmp1  = pB[1+bs*2];
+			tmp2  = pB[2+bs*2];
+			tmp3  = pB[3+bs*2];
+			tmp0 -= pB[0+bs*0] * u_02;
+			tmp1 -= pB[1+bs*0] * u_02;
+			tmp2 -= pB[2+bs*0] * u_02;
+			tmp3 -= pB[3+bs*0] * u_02;
+			tmp0 -= pB[0+bs*1] * u_12;
+			tmp1 -= pB[1+bs*1] * u_12;
+			tmp2 -= pB[2+bs*1] * u_12;
+			tmp3 -= pB[3+bs*1] * u_12;
+			pB[0+bs*2] = tmp0;
+			pB[1+bs*2] = tmp1;
+			pB[2+bs*2] = tmp2;
+			pB[3+bs*2] = tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			tmp0  = pB[0+bs*2];
+			tmp0 -= pB[0+bs*0] * u_02;
+			tmp0 -= pB[0+bs*1] * u_12;
+			pB[0+bs*2] = tmp0;
+			pB += 1;
+			}
+		}
+	else // m = {2,3}
+		{
+		u_02  = pA[0+bs*2];
+		u_12  = pA[1+bs*2];
+		u_12 -= pA[1+bs*0] * u_02;
+		pA[1+bs*2] = u_12;
+		if(m>2)
+			{
+			tmp2  = pA[2+bs*2];
+			tmp2 -= pA[2+bs*0] * u_02;
+			tmp2 -= pA[2+bs*1] * u_12;
+			pA[2+bs*2] = tmp2;
+			}
+		}
+
+	// find pivot & scale
+	if(m>2)
+		{
+		didamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+		ipiv[2] = idamax+2;
+		if(tmp2!=0)
+			{
+			if(ipiv[2]!=2)
+				drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+			tmp2 = 1.0 / pA[2+bs*2];
+			inv_diag_A[2] = tmp2;
+			if(m>=4)
+				{
+				pA[3+bs*2] *= tmp2;
+				pB = pA + bs*sda;
+				for(k=0; k<ma-3; k+=4)
+					{
+					pB[0+bs*2] *= tmp2;
+					pB[1+bs*2] *= tmp2;
+					pB[2+bs*2] *= tmp2;
+					pB[3+bs*2] *= tmp2;
+					pB += bs*sda;
+					}
+				for( ; k<ma; k++)
+					{
+					pB[0+bs*2] *= tmp2;
+					pB += 1;
+					}
+				}
+			}
+		else
+			{
+			inv_diag_A[2] = 0.0;
+			}
+		}
+
+	if(n<4)
+		return;
+
+	// fourth column
+
+	// correct
+	if(m>=4)
+		{
+		u_03  = pA[0+bs*3];
+		u_13  = pA[1+bs*3];
+		u_13 -= pA[1+bs*0] * u_03;
+		pA[1+bs*3] = u_13;
+		u_23  = pA[2+bs*3];
+		u_23 -= pA[2+bs*0] * u_03;
+		u_23 -= pA[2+bs*1] * u_13;
+		pA[2+bs*3] = u_23;
+		tmp3  = pA[3+bs*3];
+		tmp3 -= pA[3+bs*0] * u_03;
+		tmp3 -= pA[3+bs*1] * u_13;
+		tmp3 -= pA[3+bs*2] * u_23;
+		pA[3+bs*3] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			tmp0  = pB[0+bs*3];
+			tmp1  = pB[1+bs*3];
+			tmp2  = pB[2+bs*3];
+			tmp3  = pB[3+bs*3];
+			tmp0 -= pB[0+bs*0] * u_03;
+			tmp1 -= pB[1+bs*0] * u_03;
+			tmp2 -= pB[2+bs*0] * u_03;
+			tmp3 -= pB[3+bs*0] * u_03;
+			tmp0 -= pB[0+bs*1] * u_13;
+			tmp1 -= pB[1+bs*1] * u_13;
+			tmp2 -= pB[2+bs*1] * u_13;
+			tmp3 -= pB[3+bs*1] * u_13;
+			tmp0 -= pB[0+bs*2] * u_23;
+			tmp1 -= pB[1+bs*2] * u_23;
+			tmp2 -= pB[2+bs*2] * u_23;
+			tmp3 -= pB[3+bs*2] * u_23;
+			pB[0+bs*3] = tmp0;
+			pB[1+bs*3] = tmp1;
+			pB[2+bs*3] = tmp2;
+			pB[3+bs*3] = tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			tmp0  = pB[0+bs*3];
+			tmp0 -= pB[0+bs*0] * u_03;
+			tmp0 -= pB[0+bs*1] * u_13;
+			tmp0 -= pB[0+bs*2] * u_23;
+			pB[0+bs*3] = tmp0;
+			pB += 1;
+			}
+		}
+	else // m = {2,3}
+		{
+		u_03  = pA[0+bs*3];
+		u_13  = pA[1+bs*3];
+		u_13 -= pA[1+bs*0] * u_03;
+		pA[1+bs*3] = u_13;
+		if(m>2)
+			{
+			u_23  = pA[2+bs*3];
+			u_23 -= pA[2+bs*0] * u_03;
+			u_23 -= pA[2+bs*1] * u_13;
+			pA[2+bs*3] = u_23;
+			}
+		}
+
+	if(m>3)
+		{
+		// find pivot & scale
+		didamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+		ipiv[3] = idamax+3;
+		if(tmp3!=0)
+			{
+			if(ipiv[3]!=3)
+				drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+			tmp3 = 1.0 / pA[3+bs*3];
+			inv_diag_A[3] = tmp3;
+			pB = pA + bs*sda;
+			for(k=0; k<ma-3; k+=4)
+				{
+				pB[0+bs*3] *= tmp3;
+				pB[1+bs*3] *= tmp3;
+				pB[2+bs*3] *= tmp3;
+				pB[3+bs*3] *= tmp3;
+				pB += bs*sda;
+				}
+			for( ; k<ma; k++)
+				{
+				pB[0+bs*3] *= tmp3;
+				pB += 1;
+				}
+			}
+		else
+			{
+			inv_diag_A[3] = 0.0;
+			}
+		}
+	
+	return;
+
+	}
+
+
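+// minimal calling sketch (illustrative only, assuming an 8x4 block packed as two
+// 4-row panels with panel stride sda=4; the example function is hypothetical)
+#if 0
+void kernel_dgetrf_pivot_4_example()
+	{
+	double pA[32]; // 8x4 block: panel 0 holds rows 0-3, panel 1 holds rows 4-7
+	double inv_diag_A[4];
+	int ipiv[4];
+	// fill pA in panel-major order: element (i,j) -> pA[(i/4)*4*4 + i%4 + 4*j]
+	kernel_dgetrf_pivot_4_lib4(8, pA, 4, inv_diag_A, ipiv);
+	// ipiv[j] is the 0-based row (within the 8-row block) swapped into position j
+	}
+#endif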
+	
+
+
diff --git a/kernel/c99/kernel_dsymv_4_lib4.c b/kernel/c99/kernel_dsymv_4_lib4.c
new file mode 100644
index 0000000..bed4300
--- /dev/null
+++ b/kernel/c99/kernel_dsymv_4_lib4.c
@@ -0,0 +1,1024 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
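+// kernel_dgemv_nt_4_vs_lib4 sweeps a 4-column panel of A once and accumulates both
+// products: z_n += alpha_n*A*x_n (z_n must already contain the scaled y_n, see the
+// note below) and z_t = alpha_t*A'*x_t + beta_t*y_t; km<=4 limits the active columns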
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+void kernel_dgemv_nt_4_vs_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t, int km)
+	{
+
+	if(kmax<=0) 
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	double
+		a_00, a_01, a_02, a_03,
+		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+	
+	x_n_0 = 0;
+	x_n_1 = 0;
+	x_n_2 = 0;
+	x_n_3 = 0;
+
+	x_n_0 = alpha_n[0]*x_n[0];
+	if(km>1)
+		{
+		x_n_1 = alpha_n[0]*x_n[1];
+		if(km>2)
+			{
+			x_n_2 = alpha_n[0]*x_n[2];
+			if(km>3)
+				{
+				x_n_3 = alpha_n[0]*x_n[3];
+				}
+			}
+		}
+
+	y_t_0 = 0;
+	y_t_1 = 0;
+	y_t_2 = 0;
+	y_t_3 = 0;
+
+	k = 0;
+	for(; k<kmax-3; k+=bs)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+
+		// 1
+
+		y_n_0 = z_n[1]; 
+		x_t_0 = x_t[1];
+
+		a_00 = A[1+bs*0];
+		a_01 = A[1+bs*1];
+		a_02 = A[1+bs*2];
+		a_03 = A[1+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[1] = y_n_0;
+
+
+		// 2
+
+		y_n_0 = z_n[2]; 
+		x_t_0 = x_t[2];
+
+		a_00 = A[2+bs*0];
+		a_01 = A[2+bs*1];
+		a_02 = A[2+bs*2];
+		a_03 = A[2+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[2] = y_n_0;
+
+
+		// 3
+
+		y_n_0 = z_n[3]; 
+		x_t_0 = x_t[3];
+
+		a_00 = A[3+bs*0];
+		a_01 = A[3+bs*1];
+		a_02 = A[3+bs*2];
+		a_03 = A[3+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[3] = y_n_0;
+
+
+		A += sda*bs;
+		z_n += 4;
+		x_t += 4;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		}
+	
+	// store t
+	z_t[0] = alpha_t[0]*y_t_0 + beta_t[0]*y_t[0];
+	if(km>1)
+		{
+		z_t[1] = alpha_t[0]*y_t_1 + beta_t[0]*y_t[1];
+		if(km>2)
+			{
+			z_t[2] = alpha_t[0]*y_t_2 + beta_t[0]*y_t[2];
+			if(km>3)
+				{
+				z_t[3] = alpha_t[0]*y_t_3 + beta_t[0]*y_t[3];
+				}
+			}
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+void kernel_dgemv_nt_4_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t)
+	{
+
+	kernel_dgemv_nt_4_vs_lib4(kmax, alpha_n, alpha_t, A, sda, x_n, x_t, beta_t, y_t, z_n, z_t, 4);
+
+	return;
+
+	}
+#endif
+
+
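+// kernel_dsymv_l_4_gen_lib4 computes the symmetric matrix-vector product using only
+// the lower triangle of a 4-column block: x_t and z_t alias x_n and z_n, the 4x4
+// diagonal block is handled explicitly (offA is its row offset inside the panel),
+// the strictly lower part is swept with the same n/t accumulation as the gemv_nt
+// kernel, and the transposed contribution is added to z_t at the store_t label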
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+void kernel_dsymv_l_4_gen_lib4(int kmax, double *alpha, int offA, double *A, int sda, double *x_n, double *z_n, int km)
+	{
+
+	if(kmax<=0) 
+		return;
+	
+	double *x_t = x_n;
+	double *z_t = z_n;
+
+	const int bs = 4;
+
+	int k;
+
+	double
+		a_00, a_01, a_02, a_03,
+		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+	
+	x_n_0 = 0;
+	x_n_1 = 0;
+	x_n_2 = 0;
+	x_n_3 = 0;
+
+	x_n_0 = alpha[0]*x_n[0];
+	if(km>1)
+		{
+		x_n_1 = alpha[0]*x_n[1];
+		if(km>2)
+			{
+			x_n_2 = alpha[0]*x_n[2];
+			if(km>3)
+				{
+				x_n_3 = alpha[0]*x_n[3];
+				}
+			}
+		}
+
+	y_t_0 = 0;
+	y_t_1 = 0;
+	y_t_2 = 0;
+	y_t_3 = 0;
+
+	k = 0;
+	if(offA==0)
+		{
+		if(kmax<4)
+			{
+			// 0
+
+			x_t_0 = x_t[0];
+
+			a_00 = A[0+bs*0];
+			
+			y_t_0 += a_00 * x_t_0;
+
+			if(kmax==1)
+				goto store_t;
+
+			// 1
+
+			y_n_0 = z_n[1]; 
+			x_t_0 = x_t[1];
+
+			a_00 = A[1+bs*0];
+			a_01 = A[1+bs*1];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_t_1 += a_01 * x_t_0;
+
+			z_n[1] = y_n_0;
+
+			if(kmax==2)
+				goto store_t;
+
+			// 2
+
+			y_n_0 = z_n[2]; 
+			x_t_0 = x_t[2];
+
+			a_00 = A[2+bs*0];
+			a_01 = A[2+bs*1];
+			a_02 = A[2+bs*2];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_n_0 += a_01 * x_n_1;
+			y_t_1 += a_01 * x_t_0;
+			y_t_2 += a_02 * x_t_0;
+
+			z_n[2] = y_n_0;
+
+			goto store_t;
+			}
+		else
+			{
+
+			// 0
+
+			x_t_0 = x_t[0];
+
+			a_00 = A[0+bs*0];
+			
+			y_t_0 += a_00 * x_t_0;
+
+
+			// 1
+
+			y_n_0 = z_n[1]; 
+			x_t_0 = x_t[1];
+
+			a_00 = A[1+bs*0];
+			a_01 = A[1+bs*1];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_t_1 += a_01 * x_t_0;
+
+			z_n[1] = y_n_0;
+
+
+			// 2
+
+			y_n_0 = z_n[2]; 
+			x_t_0 = x_t[2];
+
+			a_00 = A[2+bs*0];
+			a_01 = A[2+bs*1];
+			a_02 = A[2+bs*2];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_n_0 += a_01 * x_n_1;
+			y_t_1 += a_01 * x_t_0;
+			y_t_2 += a_02 * x_t_0;
+
+			z_n[2] = y_n_0;
+
+
+			// 3
+
+			y_n_0 = z_n[3]; 
+			x_t_0 = x_t[3];
+
+			a_00 = A[3+bs*0];
+			a_01 = A[3+bs*1];
+			a_02 = A[3+bs*2];
+			a_03 = A[3+bs*3];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_n_0 += a_01 * x_n_1;
+			y_t_1 += a_01 * x_t_0;
+			y_n_0 += a_02 * x_n_2;
+			y_t_2 += a_02 * x_t_0;
+			y_t_3 += a_03 * x_t_0;
+
+			z_n[3] = y_n_0;
+
+			k += 4;
+			A += sda*bs;
+			z_n += 4;
+			x_t += 4;
+
+			}
+		}
+	else if(offA==1)
+		{
+
+		// 0
+
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		
+		y_t_0 += a_00 * x_t_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==1)
+			goto store_t;
+
+		// 1
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_t_1 += a_01 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==2)
+			goto store_t;
+
+		// 2
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_t_2 += a_02 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==3)
+			goto store_t;
+
+		// 3
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==4)
+			goto store_t;
+
+		// 4
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==5)
+			goto store_t;
+
+		// 5
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==6)
+			goto store_t;
+
+		// 6
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==7)
+			goto store_t;
+
+		k += 7;
+
+		}
+	else if(offA==2)
+		{
+
+		// 0
+
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		
+		y_t_0 += a_00 * x_t_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==1)
+			goto store_t;
+
+		// 1
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_t_1 += a_01 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==2)
+			goto store_t;
+
+		// 2
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_t_2 += a_02 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==3)
+			goto store_t;
+
+		// 3
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==4)
+			goto store_t;
+
+		// 4
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==5)
+			goto store_t;
+
+		// 5
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==6)
+			goto store_t;
+
+		k += 6;
+
+		}
+	else // if(offA==3)
+		{
+
+		// 0
+
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		
+		y_t_0 += a_00 * x_t_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==1)
+			goto store_t;
+
+		// 1
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_t_1 += a_01 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==2)
+			goto store_t;
+
+		// 2
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_t_2 += a_02 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==3)
+			goto store_t;
+
+		// 3
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==4)
+			goto store_t;
+
+		// 4
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==5)
+			goto store_t;
+
+		k += 5;
+
+		}
+	for(; k<kmax-3; k+=bs)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+
+		// 1
+
+		y_n_0 = z_n[1]; 
+		x_t_0 = x_t[1];
+
+		a_00 = A[1+bs*0];
+		a_01 = A[1+bs*1];
+		a_02 = A[1+bs*2];
+		a_03 = A[1+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[1] = y_n_0;
+
+
+		// 2
+
+		y_n_0 = z_n[2]; 
+		x_t_0 = x_t[2];
+
+		a_00 = A[2+bs*0];
+		a_01 = A[2+bs*1];
+		a_02 = A[2+bs*2];
+		a_03 = A[2+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[2] = y_n_0;
+
+
+		// 3
+
+		y_n_0 = z_n[3]; 
+		x_t_0 = x_t[3];
+
+		a_00 = A[3+bs*0];
+		a_01 = A[3+bs*1];
+		a_02 = A[3+bs*2];
+		a_03 = A[3+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[3] = y_n_0;
+
+
+		A += sda*bs;
+		z_n += 4;
+		x_t += 4;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		}
+	
+	store_t:
+	z_t[0] += alpha[0]*y_t_0;
+	if(km>1)
+		{
+		z_t[1] += alpha[0]*y_t_1;
+		if(km>2)
+			{
+			z_t[2] += alpha[0]*y_t_2;
+			if(km>3)
+				{
+				z_t[3] += alpha[0]*y_t_3;
+				}
+			}
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+void kernel_dsymv_l_4_lib4(int kmax, double *alpha, double *A, int sda, double *x_n, double *z_n)
+	{
+
+	kernel_dsymv_l_4_gen_lib4(kmax, alpha, 0, A, sda, x_n, z_n, 4);
+
+	return;
+
+	}
+#endif
+
+
+
+
diff --git a/kernel/c99/kernel_sgecp_lib4.c b/kernel/c99/kernel_sgecp_lib4.c
new file mode 100644
index 0000000..de5b704
--- /dev/null
+++ b/kernel/c99/kernel_sgecp_lib4.c
@@ -0,0 +1,1148 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
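+// naming convention in this file: sgesc_N scales N rows of each 4-wide column by
+// alpha, sgecp_M_K copies M rows when the source A starts K rows into its panel
+// (for K>0 the rows straddle the two panels A0 and A1 = A0 + bs*sda), strcp_l_*
+// do the same for a lower triangular matrix (kmax+1 full columns plus the trailing
+// triangle), and sgead_* add alpha*A to B element-wise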
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_sgesc_4_lib4(int kmax, float *alphap, float *A)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		A[0+bs*0] *= alpha;
+		A[1+bs*0] *= alpha;
+		A[2+bs*0] *= alpha;
+		A[3+bs*0] *= alpha;
+
+		A += 4;
+
+		}
+	
+	}
+
+
+
+void kernel_sgesc_3_lib4(int kmax, float *alphap, float *A)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		A[0+bs*0] *= alpha;
+		A[1+bs*0] *= alpha;
+		A[2+bs*0] *= alpha;
+
+		A += 4;
+
+		}
+	
+	}
+
+
+
+void kernel_sgesc_2_lib4(int kmax, float *alphap, float *A)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		A[0+bs*0] *= alpha;
+		A[1+bs*0] *= alpha;
+
+		A += 4;
+
+		}
+	
+	}
+
+
+
+void kernel_sgesc_1_lib4(int kmax, float *alphap, float *A)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		A[0+bs*0] *= alpha;
+
+		A += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_sgecp_4_0_lib4(int kmax, float *A, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A[0+bs*0];
+		B[1+bs*0] = A[1+bs*0];
+		B[2+bs*0] = A[2+bs*0];
+		B[3+bs*0] = A[3+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_sgecp_4_1_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[1+bs*0];
+		B[1+bs*0] = A0[2+bs*0];
+		B[2+bs*0] = A0[3+bs*0];
+		B[3+bs*0] = A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_sgecp_4_2_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[2+bs*0];
+		B[1+bs*0] = A0[3+bs*0];
+		B[2+bs*0] = A1[0+bs*0];
+		B[3+bs*0] = A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_sgecp_4_3_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[3+bs*0];
+		B[1+bs*0] = A1[0+bs*0];
+		B[2+bs*0] = A1[1+bs*0];
+		B[3+bs*0] = A1[2+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgecp_3_0_lib4(int kmax, float *A, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A[0+bs*0];
+		B[1+bs*0] = A[1+bs*0];
+		B[2+bs*0] = A[2+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_sgecp_3_2_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[2+bs*0];
+		B[1+bs*0] = A0[3+bs*0];
+		B[2+bs*0] = A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_sgecp_3_3_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[3+bs*0];
+		B[1+bs*0] = A1[0+bs*0];
+		B[2+bs*0] = A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgecp_2_0_lib4(int kmax, float *A, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A[0+bs*0];
+		B[1+bs*0] = A[1+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_sgecp_2_3_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[3+bs*0];
+		B[1+bs*0] = A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgecp_1_0_lib4(int kmax, float *A, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A[0+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_strcp_l_4_0_lib4(int kmax, float *A, float *B)
+	{
+
+	// A and B are lower triangular
+	// kmax+1 4-wide + end 3x3 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A[0+bs*0];
+		B[1+bs*0] = A[1+bs*0];
+		B[2+bs*0] = A[2+bs*0];
+		B[3+bs*0] = A[3+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// 3x3 triangle
+
+	B[1+bs*0] = A[1+bs*0];
+	B[2+bs*0] = A[2+bs*0];
+	B[3+bs*0] = A[3+bs*0];
+
+	B[2+bs*1] = A[2+bs*1];
+	B[3+bs*1] = A[3+bs*1];
+
+	B[3+bs*2] = A[3+bs*2];
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_strcp_l_4_1_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	// A and B are lower triangular
+	// kmax+1 4-wide + end 3x3 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[1+bs*0];
+		B[1+bs*0] = A0[2+bs*0];
+		B[2+bs*0] = A0[3+bs*0];
+		B[3+bs*0] = A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	// 3x3 triangle
+
+	B[1+0*bs] = A0[2+0*bs];
+	B[2+0*bs] = A0[3+0*bs];
+	B[3+0*bs] = A1[0+0*bs];
+
+	B[2+1*bs] = A0[3+1*bs];
+	B[3+1*bs] = A1[0+1*bs];
+
+	B[3+2*bs] = A1[0+2*bs];
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_strcp_l_4_2_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	// A and B are lower triangular
+	// kmax+1 4-wide + end 3x3 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[2+bs*0];
+		B[1+bs*0] = A0[3+bs*0];
+		B[2+bs*0] = A1[0+bs*0];
+		B[3+bs*0] = A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	// 3x3 triangle
+
+	B[1+bs*0] = A0[3+bs*0];
+	B[2+bs*0] = A1[0+bs*0];
+	B[3+bs*0] = A1[1+bs*0];
+
+	B[2+bs*1] = A1[0+bs*1];
+	B[3+bs*1] = A1[1+bs*1];
+
+	B[3+bs*2] = A1[1+bs*2];
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_strcp_l_4_3_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	// A and B are lower triangular
+	// kmax+1 4-wide + end 3x3 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[3+bs*0];
+		B[1+bs*0] = A1[0+bs*0];
+		B[2+bs*0] = A1[1+bs*0];
+		B[3+bs*0] = A1[2+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	// 3x3 triangle
+
+	B[1+bs*0] = A1[0+bs*0];
+	B[2+bs*0] = A1[1+bs*0];
+	B[3+bs*0] = A1[2+bs*0];
+
+	B[2+bs*1] = A1[1+bs*1];
+	B[3+bs*1] = A1[2+bs*1];
+
+	B[3+bs*2] = A1[2+bs*2];
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_strcp_l_3_0_lib4(int kmax, float *A, float *B)
+	{
+
+	// A and B are lower triangular
+	// kmax+1 3-wide + end 2x2 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A[0+bs*0];
+		B[1+bs*0] = A[1+bs*0];
+		B[2+bs*0] = A[2+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// 2x2 triangle
+
+	B[1+bs*0] = A[1+bs*0];
+	B[2+bs*0] = A[2+bs*0];
+
+	B[2+bs*1] = A[2+bs*1];
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_strcp_l_3_2_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	// A and B are lower triangular
+	// kmax+1 3-wide + end 2x2 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[2+bs*0];
+		B[1+bs*0] = A0[3+bs*0];
+		B[2+bs*0] = A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	// 2x2 triangle
+
+	B[1+bs*0] = A0[3+bs*0];
+	B[2+bs*0] = A1[0+bs*0];
+
+	B[2+bs*1] = A1[0+bs*1];
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_strcp_l_3_3_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	// A and B are lower triangular
+	// kmax+1 3-wide + end 2x2 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[3+bs*0];
+		B[1+bs*0] = A1[0+bs*0];
+		B[2+bs*0] = A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	// 2x2 triangle
+
+	B[1+bs*0] = A1[0+bs*0];
+	B[2+bs*0] = A1[1+bs*0];
+
+	B[2+bs*1] = A1[1+bs*1];
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_strcp_l_2_0_lib4(int kmax, float alpha, float *A, float *B)
+	{
+
+	// A and B are lower triangular
+	// kmax+1 2-wide + end 1x1 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A[0+bs*0];
+		B[1+bs*0] = A[1+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	// 1x1 triangle
+
+	B[1+bs*0] = A[1+bs*0];
+
+	}
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_strcp_l_2_3_lib4(int kmax, float *A0, int sda, float *B)
+	{
+
+	// A and B are lower triangular
+	// kmax+1 2-wide + end 1x1 triangle
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A0[3+bs*0];
+		B[1+bs*0] = A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+	
+	// 1x1 triangle
+
+	B[1+bs*0] = A1[0+bs*0];
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_strcp_l_1_0_lib4(int kmax, float *A, float *B)
+	{
+
+	// A and B are lower triangular
+	// kmax+1 1-wide
+
+	kmax += 1;
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] = A[0+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_sgead_4_0_lib4(int kmax, float *alphap, float *A, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+		B[2+bs*0] += alpha * A[2+bs*0];
+		B[3+bs*0] += alpha * A[3+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_sgead_4_1_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[1+bs*0];
+		B[1+bs*0] += alpha * A0[2+bs*0];
+		B[2+bs*0] += alpha * A0[3+bs*0];
+		B[3+bs*0] += alpha * A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_sgead_4_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[2+bs*0];
+		B[1+bs*0] += alpha * A0[3+bs*0];
+		B[2+bs*0] += alpha * A1[0+bs*0];
+		B[3+bs*0] += alpha * A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_sgead_4_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+		B[2+bs*0] += alpha * A1[1+bs*0];
+		B[3+bs*0] += alpha * A1[2+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgead_3_0_lib4(int kmax, float *alphap, float *A, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+		B[2+bs*0] += alpha * A[2+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_sgead_3_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[2+bs*0];
+		B[1+bs*0] += alpha * A0[3+bs*0];
+		B[2+bs*0] += alpha * A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_sgead_3_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+		B[2+bs*0] += alpha * A1[1+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgead_2_0_lib4(int kmax, float *alphap, float *A, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A[0+bs*0];
+		B[1+bs*0] += alpha * A[1+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_sgead_2_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	float *A1 = A0 + bs*sda;
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A0[3+bs*0];
+		B[1+bs*0] += alpha * A1[0+bs*0];
+
+		A0 += 4;
+		A1 += 4;
+		B  += 4;
+
+		}
+
+	}
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgead_1_0_lib4(int kmax, float *alphap, float *A, float *B)
+	{
+
+	if(kmax<=0)
+		return;
+
+	const int bs = 4;
+
+	float alpha = alphap[0];
+
+	int k;
+
+	for(k=0; k<kmax; k++)
+		{
+
+		B[0+bs*0] += alpha * A[0+bs*0];
+
+		A += 4;
+		B += 4;
+
+		}
+
+	}
+
+
+
+
+
diff --git a/kernel/c99/kernel_sgemm_4x4_lib4.c b/kernel/c99/kernel_sgemm_4x4_lib4.c
new file mode 100644
index 0000000..243d559
--- /dev/null
+++ b/kernel/c99/kernel_sgemm_4x4_lib4.c
@@ -0,0 +1,6094 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <math.h>
+
+
+
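+// kernel_sgemm_nt_4x4_gen_lib4 computes a 4x4 block of D = alpha*A*B' + beta*C, with
+// A and B stored as 4 x kmax panels ("nt": A not transposed, B transposed); the gen
+// variant also takes row offsets for C and D (offsetC, offsetD, with C1/D1 used when
+// the block straddles a panel boundary) and the ranges m0..m1, n0..n1 that appear to
+// bound which rows and columns are actually stored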
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_nt_4x4_gen_lib4(int kmax, float *alpha, float *A, float *B, float *beta, int offsetC, float *C0, int sdc, int offsetD, float *D0, int sdd, int m0, int m1, int n0, int n1)
+	{
+
+	const int bs = 4;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	float
+		*C1, *D1;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	if(offsetC==0)
+		{
+		c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[0+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[0+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[1+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[0+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[1+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C0[2+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==1)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[1+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[1+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[2+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[1+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[2+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C0[3+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+		}
+	else if(offsetC==2)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[2+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[2+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C0[3+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[2+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C0[3+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C1[0+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+		}
+	else //if(offsetC==3)
+		{
+		C1 = C0 + sdc*bs;
+
+		c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+		c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+		c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+		c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+		c_01 = beta[0]*C0[3+bs*1] + alpha[0]*c_01;
+		c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+		c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+		c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+		c_02 = beta[0]*C0[3+bs*2] + alpha[0]*c_02;
+		c_12 = beta[0]*C1[0+bs*2] + alpha[0]*c_12;
+		c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+		c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+		c_03 = beta[0]*C0[3+bs*3] + alpha[0]*c_03;
+		c_13 = beta[0]*C1[0+bs*3] + alpha[0]*c_13;
+		c_23 = beta[0]*C1[1+bs*3] + alpha[0]*c_23;
+		c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+		}
+	
+	// shift the computed columns left by n0 (only columns n0..n1-1 are stored)
+	if(n0>0)
+		{
+		if(n0==1)
+			{
+			c_00 = c_01;
+			c_10 = c_11;
+			c_20 = c_21;
+			c_30 = c_31;
+
+			c_01 = c_02;
+			c_11 = c_12;
+			c_21 = c_22;
+			c_31 = c_32;
+
+			c_02 = c_03;
+			c_12 = c_13;
+			c_22 = c_23;
+			c_32 = c_33;
+
+			D0 += 1*bs;
+			}
+		else if(n0==2)
+			{
+			c_00 = c_02;
+			c_10 = c_12;
+			c_20 = c_22;
+			c_30 = c_32;
+
+			c_01 = c_03;
+			c_11 = c_13;
+			c_21 = c_23;
+			c_31 = c_33;
+
+			D0 += 2*bs;
+			}
+		else //if(n0==3)
+			{
+			c_00 = c_03;
+			c_10 = c_13;
+			c_20 = c_23;
+			c_30 = c_33;
+
+			D0 += 3*bs;
+			}
+		}
+
+	int kn = n1 - n0;
+
+	if(offsetD==0)
+		{
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+		if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+		if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+		if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+		if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+		}
+	else if(offsetD==1)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+		}
+	else if(offsetD==2)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+		}
+	else //if(offsetD==3)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+		if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+		if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+		if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+		if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
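+// Variable-size kernel: D = beta*C + alpha * A * B^T on a 4x4 block, storing only the top-left km x kn corner.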
+void kernel_sgemm_nt_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+	c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+	c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+	c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+	c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+	c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+	c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER)
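+// Fixed-size 4x4 'nt' kernel: thin wrapper around the variable-size version with km = kn = 4.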
+void kernel_sgemm_nt_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+	{
+	kernel_sgemm_nt_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
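+// Variable-size kernel: D = beta*C + alpha * A * B on a 4x4 block; B is not transposed and is
+// read across panels with panel stride sdb. Only the top-left km x kn corner is stored.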
+void kernel_sgemm_nn_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[1];
+		b_1 = B[5];
+		b_2 = B[9];
+		b_3 = B[13];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[2];
+		b_1 = B[6];
+		b_2 = B[10];
+		b_3 = B[14];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[3];
+		b_1 = B[7];
+		b_2 = B[11];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 4*sdb;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+
+		}
+	
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+	c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+	c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+	c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+	c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+	c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+	c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
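+// Fixed-size 4x4 'nn' kernel: thin wrapper around the variable-size version with km = kn = 4.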
+void kernel_sgemm_nn_4x4_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D)
+	{
+	kernel_sgemm_nn_4x4_vs_lib4(kmax, alpha, A, B, sdb, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
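+// Variable-size SYRK kernel: D = beta*C + alpha * A * B^T, computing and storing only the lower
+// triangle of the 4x4 block (the upper-triangle updates are left commented out on purpose).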
+void kernel_ssyrk_nt_l_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, //c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, //c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, //c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+//		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+//		c_02 += a_0 * b_2;
+//		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+//		c_03 += a_0 * b_3;
+//		c_13 += a_1 * b_3;
+//		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+//		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+//		c_02 += a_0 * b_2;
+//		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+//		c_03 += a_0 * b_3;
+//		c_13 += a_1 * b_3;
+//		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+//		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+//		c_02 += a_0 * b_2;
+//		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+//		c_03 += a_0 * b_3;
+//		c_13 += a_1 * b_3;
+//		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+//		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+//		c_02 += a_0 * b_2;
+//		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+//		c_03 += a_0 * b_3;
+//		c_13 += a_1 * b_3;
+//		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+//		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+//		c_02 += a_0 * b_2;
+//		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+//		c_03 += a_0 * b_3;
+//		c_13 += a_1 * b_3;
+//		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+//	c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+//	c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+//	c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+//	c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+//	c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+//	c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+//		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+//		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+//		if(kn==2)
+//			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+//		if(kn==1)
+//			return;
+
+//		D[0+bs*1] = c_01;
+
+//		if(kn==2)
+//			return;
+
+//		D[0+bs*2] = c_02;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
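+// Fixed-size lower-triangular SYRK kernel: wrapper around the variable-size version with km = kn = 4.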
+void kernel_ssyrk_nt_l_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+	{
+	kernel_ssyrk_nt_l_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
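+// Variable-size TRMM kernel: D = beta*C + alpha * A * B^T with B upper triangular ('ru');
+// the first three iterations are peeled so that only the nonzero part of B is read.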
+void kernel_strmm_nt_ru_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	k = 0;
+
+	// k = 0
+	if(kmax>0)
+		{
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 4;
+		k++;
+		}
+
+	// k = 1
+	if(kmax>1)
+		{
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 4;
+		k++;
+		}
+
+	// k = 2
+	if(kmax>2)
+		{
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 4;
+		k++;
+		}
+
+	for(; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+	c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+	c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+	c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+	c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+	c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+	c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+	c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+	c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+	c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+	c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+	c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+	c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+	c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+	c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+	c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
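+// Fixed-size TRMM ('nt', right-upper) kernel: wrapper around the variable-size version with km = kn = 4.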
+void kernel_strmm_nt_ru_4x4_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+	{
+	kernel_strmm_nt_ru_4x4_vs_lib4(k, alpha, A, B, beta, C, D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
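+// Generalized TRMM kernel: D = alpha * A * B with B lower triangular ('rl', not transposed);
+// offsetB is the row offset of B inside its panel (panel stride sdb), offsetD/sdd position D,
+// and only rows [m0,m1) and columns [n0,n1) of the 4x4 block are stored.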
+void kernel_strmm_nn_rl_4x4_gen_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D0, int sdd, int m0, int m1, int n0, int n1)
+	{
+
+	const int bs = 4;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	float *D1;
+	
+	int k;
+
+	B += offsetB;
+
+	k = 0;
+
+	if(offsetB==0)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 3
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+	else if(offsetB==1)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+	else if(offsetB==2)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 3
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 4
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 5
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+	else // if(offsetB==3)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 1
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 2
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 3
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+		k += 1;
+
+		if(k>=kmax)
+			goto store;
+
+		// k = 4
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		b_1 = B[4];
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		b_2 = B[8];
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		b_3 = B[12];
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 4*sdb-3;
+		k += 1;
+
+		}
+
+	for(; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[1];
+		b_1 = B[5];
+		b_2 = B[9];
+		b_3 = B[13];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[2];
+		b_1 = B[6];
+		b_2 = B[10];
+		b_3 = B[14];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[3];
+		b_1 = B[7];
+		b_2 = B[11];
+		b_3 = B[15];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 16;
+		B += 4*sdb;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[4];
+		b_2 = B[8];
+		b_3 = B[12];
+
+		c_00 += a_0 * b_0;
+		c_10 += a_1 * b_0;
+		c_20 += a_2 * b_0;
+		c_30 += a_3 * b_0;
+
+		c_01 += a_0 * b_1;
+		c_11 += a_1 * b_1;
+		c_21 += a_2 * b_1;
+		c_31 += a_3 * b_1;
+
+		c_02 += a_0 * b_2;
+		c_12 += a_1 * b_2;
+		c_22 += a_2 * b_2;
+		c_32 += a_3 * b_2;
+
+		c_03 += a_0 * b_3;
+		c_13 += a_1 * b_3;
+		c_23 += a_2 * b_3;
+		c_33 += a_3 * b_3;
+
+		A += 4;
+		B += 1;
+
+		}
+	
+	store:
+	
+	c_00 = alpha[0]*c_00;
+	c_10 = alpha[0]*c_10;
+	c_20 = alpha[0]*c_20;
+	c_30 = alpha[0]*c_30;
+
+	c_01 = alpha[0]*c_01;
+	c_11 = alpha[0]*c_11;
+	c_21 = alpha[0]*c_21;
+	c_31 = alpha[0]*c_31;
+
+	c_02 = alpha[0]*c_02;
+	c_12 = alpha[0]*c_12;
+	c_22 = alpha[0]*c_22;
+	c_32 = alpha[0]*c_32;
+
+	c_03 = alpha[0]*c_03;
+	c_13 = alpha[0]*c_13;
+	c_23 = alpha[0]*c_23;
+	c_33 = alpha[0]*c_33;
+
+	// shift the computed columns left by n0 (only columns n0..n1-1 are stored)
+	if(n0>0)
+		{
+		if(n0==1)
+			{
+			c_00 = c_01;
+			c_10 = c_11;
+			c_20 = c_21;
+			c_30 = c_31;
+
+			c_01 = c_02;
+			c_11 = c_12;
+			c_21 = c_22;
+			c_31 = c_32;
+
+			c_02 = c_03;
+			c_12 = c_13;
+			c_22 = c_23;
+			c_32 = c_33;
+
+			D0 += 1*bs;
+			}
+		else if(n0==2)
+			{
+			c_00 = c_02;
+			c_10 = c_12;
+			c_20 = c_22;
+			c_30 = c_32;
+
+			c_01 = c_03;
+			c_11 = c_13;
+			c_21 = c_23;
+			c_31 = c_33;
+
+			D0 += 2*bs;
+			}
+		else //if(n0==3)
+			{
+			c_00 = c_03;
+			c_10 = c_13;
+			c_20 = c_23;
+			c_30 = c_33;
+
+			D0 += 3*bs;
+			}
+		}
+
+	int kn = n1 - n0;
+
+	if(offsetD==0)
+		{
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+		if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+		if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+		if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+		if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+		}
+	else if(offsetD==1)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+		if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+		if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+		if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+		if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+		}
+	else if(offsetD==2)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+		if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+		if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+		if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+		if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+		}
+	else //if(offsetD==3)
+		{
+		D1 = D0 + sdd*bs;
+
+		if(kn<=0)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+		if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+		if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+		if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+		if(kn<=1)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+		if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+		if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+		if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+		if(kn<=2)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+		if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+		if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+		if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+		if(kn<=3)
+			return;
+
+		if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+		if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+		if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+		if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+		}
+	
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
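+// Fixed-size TRMM ('nn', right-lower) kernel: wrapper around the generalized version, storing the full 4x4 block with zero offsets.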
+void kernel_strmm_nn_rl_4x4_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, float *D)
+	{
+	kernel_strmm_nn_rl_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, 0, D, 0, 0, 4, 0, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
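+// Variable-size Cholesky kernel: factors the 4x4 block C - A*B^T = D*D^T with D lower triangular,
+// storing the reciprocals of the diagonal entries in inv_diag_D (a non-positive pivot is replaced by zero).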
+void kernel_spotrf_nt_l_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, //c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, //c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, //c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+//		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+//		c_02 -= a_0 * b_2;
+//		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+//		c_03 -= a_0 * b_3;
+//		c_13 -= a_1 * b_3;
+//		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+//	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+//	c_02 = C[0+bs*2] + c_02;
+//	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+//	c_03 = C[0+bs*3] + c_03;
+//	c_13 = C[1+bs*3] + c_13;
+//	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+	if(c_00>0)
+		{
+		c_00 = sqrt(c_00);
+		tmp = 1.0/c_00;
+		}
+	else
+		{
+		c_00 = 0.0;
+		tmp = 0.0;
+		}
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+	inv_diag_D[0] = tmp;
+
+	if(kn==1)
+		goto store;
+	
+	c_11 -= c_10 * c_10;
+	c_21 -= c_20 * c_10;
+	c_31 -= c_30 * c_10;
+	if(c_11>0)
+		{
+		c_11 = sqrt(c_11);
+		tmp = 1.0/c_11;
+		}
+	else
+		{
+		c_11 = 0.0;
+		tmp = 0.0;
+		}
+	c_21 *= tmp;
+	c_31 *= tmp;
+	inv_diag_D[1] = tmp;
+
+	if(kn==2)
+		goto store;
+	
+	c_22 -= c_20 * c_20;
+	c_32 -= c_30 * c_20;
+	c_22 -= c_21 * c_21;
+	c_32 -= c_31 * c_21;
+	if(c_22>0)
+		{
+		c_22 = sqrt(c_22);
+		tmp = 1.0/c_22;
+		}
+	else
+		{
+		c_22 = 0.0;
+		tmp = 0.0;
+		}
+	c_32 *= tmp;
+	inv_diag_D[2] = tmp;
+
+	if(kn==3)
+		goto store;
+	
+	c_33 -= c_30 * c_30;
+	c_33 -= c_31 * c_31;
+	c_33 -= c_32 * c_32;
+	if(c_33>0)
+		{
+		c_33 = sqrt(c_33);
+		tmp = 1.0/c_33;
+		}
+	else
+		{
+		c_33 = 0.0;
+		tmp = 0.0;
+		}
+	inv_diag_D[3] = tmp;
+
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+//		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+//		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+//		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+//		if(kn==2)
+//			return;
+
+//		D[0+bs*2] = c_02;
+//		D[1+bs*2] = c_12;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+//		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+//		if(kn==1)
+//			return;
+
+//		D[0+bs*1] = c_01;
+
+//		if(kn==2)
+//			return;
+
+//		D[0+bs*2] = c_02;
+
+//		if(kn==3)
+//			return;
+
+//		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
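+// Fixed-size Cholesky kernel: wrapper around the variable-size version with km = kn = 4.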
+void kernel_spotrf_nt_l_4x4_lib4(int kmax, float *A, float *B, float *C, float *D, float *inv_diag_D)
+	{
+	kernel_spotrf_nt_l_4x4_vs_lib4(kmax, A, B, C, D, inv_diag_D, 4, 4);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
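+// Fused variable-size SYRK + Cholesky kernel: D = chol(C + Ap*Bp^T - Am*Bm^T), lower triangle,
+// obtained by chaining the ssyrk and spotrf kernels above.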
+void kernel_ssyrk_spotrf_nt_l_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn)
+	{
+	float alpha = 1.0;
+	float beta = 1.0;
+	kernel_ssyrk_nt_l_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
+	kernel_spotrf_nt_l_4x4_vs_lib4(km_, Am, Bm, D, D, inv_diag_D, km, kn);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
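+// Fused fixed-size SYRK + Cholesky kernel: same as above for the full 4x4 block.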
+void kernel_ssyrk_spotrf_nt_l_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D)
+	{
+	float alpha = 1.0;
+	float beta = 1.0;
+	kernel_ssyrk_nt_l_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
+	kernel_spotrf_nt_l_4x4_lib4(km_, Am, Bm, D, D, inv_diag_D);
+	}
+#endif
+
+
+
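+// strsm_nt_rl_inv: D = (C - A * B^T) * E^{-T}, with E lower triangular and its
+// reciprocal diagonal passed in inv_diag_E; only the leading km x kn part of the 4x4 block is stored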
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nt_rl_inv_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+	c_02 = C[0+bs*2] + c_02;
+	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+	c_03 = C[0+bs*3] + c_03;
+	c_13 = C[1+bs*3] + c_13;
+	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+	tmp = inv_diag_E[0];
+	c_00 *= tmp;
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+
+	if(kn==1)
+		goto store;
+	
+	tmp = E[1+bs*0];
+	c_01 -= c_00 * tmp;
+	c_11 -= c_10 * tmp;
+	c_21 -= c_20 * tmp;
+	c_31 -= c_30 * tmp;
+	tmp = inv_diag_E[1];
+	c_01 *= tmp;
+	c_11 *= tmp;
+	c_21 *= tmp;
+	c_31 *= tmp;
+
+	if(kn==2)
+		goto store;
+	
+	tmp = E[2+bs*0];
+	c_02 -= c_00 * tmp;
+	c_12 -= c_10 * tmp;
+	c_22 -= c_20 * tmp;
+	c_32 -= c_30 * tmp;
+	tmp = E[2+bs*1];
+	c_02 -= c_01 * tmp;
+	c_12 -= c_11 * tmp;
+	c_22 -= c_21 * tmp;
+	c_32 -= c_31 * tmp;
+	tmp = inv_diag_E[2];
+	c_02 *= tmp;
+	c_12 *= tmp;
+	c_22 *= tmp;
+	c_32 *= tmp;
+
+	if(kn==3)
+		goto store;
+	
+	tmp = E[3+bs*0];
+	c_03 -= c_00 * tmp;
+	c_13 -= c_10 * tmp;
+	c_23 -= c_20 * tmp;
+	c_33 -= c_30 * tmp;
+	tmp = E[3+bs*1];
+	c_03 -= c_01 * tmp;
+	c_13 -= c_11 * tmp;
+	c_23 -= c_21 * tmp;
+	c_33 -= c_31 * tmp;
+	tmp = E[3+bs*2];
+	c_03 -= c_02 * tmp;
+	c_13 -= c_12 * tmp;
+	c_23 -= c_22 * tmp;
+	c_33 -= c_32 * tmp;
+	tmp = inv_diag_E[3];
+	c_03 *= tmp;
+	c_13 *= tmp;
+	c_23 *= tmp;
+	c_33 *= tmp;
+
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nt_rl_inv_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E)
+	{
+	kernel_strsm_nt_rl_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
+
+
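+// fused sgemm + strsm: gemm update of C with Ap*Bp^T (alpha = beta = 1.0),
+// followed by the nt_rl_inv triangular solve kernel on Am, Bm, E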
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+	{
+	float alpha = 1.0;
+	float beta  = 1.0;
+	kernel_sgemm_nt_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
+	kernel_strsm_nt_rl_inv_4x4_vs_lib4(km_, Am, Bm, D, D, E, inv_diag_E, km, kn);
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_strsm_nt_rl_inv_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E)
+	{
+	float alpha = 1.0;
+	float beta  = 1.0;
+	kernel_sgemm_nt_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
+	kernel_strsm_nt_rl_inv_4x4_lib4(km_, Am, Bm, D, D, E, inv_diag_E);
+	}
+#endif
+
+
+
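+// strsm_nt_rl_one: D = (C - A * B^T) * E^{-T}, with E unit lower triangular
+// (its diagonal is not referenced); only the leading km x kn part of the 4x4 block is stored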
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nt_rl_one_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+	c_02 = C[0+bs*2] + c_02;
+	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+	c_03 = C[0+bs*3] + c_03;
+	c_13 = C[1+bs*3] + c_13;
+	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+	if(kn==1)
+		goto store;
+	
+	tmp = E[1+bs*0];
+	c_01 -= c_00 * tmp;
+	c_11 -= c_10 * tmp;
+	c_21 -= c_20 * tmp;
+	c_31 -= c_30 * tmp;
+
+	if(kn==2)
+		goto store;
+	
+	tmp = E[2+bs*0];
+	c_02 -= c_00 * tmp;
+	c_12 -= c_10 * tmp;
+	c_22 -= c_20 * tmp;
+	c_32 -= c_30 * tmp;
+	tmp = E[2+bs*1];
+	c_02 -= c_01 * tmp;
+	c_12 -= c_11 * tmp;
+	c_22 -= c_21 * tmp;
+	c_32 -= c_31 * tmp;
+
+	if(kn==3)
+		goto store;
+	
+	tmp = E[3+bs*0];
+	c_03 -= c_00 * tmp;
+	c_13 -= c_10 * tmp;
+	c_23 -= c_20 * tmp;
+	c_33 -= c_30 * tmp;
+	tmp = E[3+bs*1];
+	c_03 -= c_01 * tmp;
+	c_13 -= c_11 * tmp;
+	c_23 -= c_21 * tmp;
+	c_33 -= c_31 * tmp;
+	tmp = E[3+bs*2];
+	c_03 -= c_02 * tmp;
+	c_13 -= c_12 * tmp;
+	c_23 -= c_22 * tmp;
+	c_33 -= c_32 * tmp;
+
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nt_rl_one_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E)
+	{
+	kernel_strsm_nt_rl_one_4x4_vs_lib4(k, A, B, C, D, E, 4, 4);
+	}
+#endif
+
+
+
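+// strsm_nt_ru_inv: D = (C - A * B^T) * E^{-T}, with E upper triangular and its
+// reciprocal diagonal passed in inv_diag_E; only the leading km x kn part of the 4x4 block is stored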
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nt_ru_inv_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	float
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		tmp,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+	
+	int k;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 1
+
+		a_0 = A[4];
+		a_1 = A[5];
+		a_2 = A[6];
+		a_3 = A[7];
+
+		b_0 = B[4];
+		b_1 = B[5];
+		b_2 = B[6];
+		b_3 = B[7];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 2
+
+		a_0 = A[8];
+		a_1 = A[9];
+		a_2 = A[10];
+		a_3 = A[11];
+
+		b_0 = B[8];
+		b_1 = B[9];
+		b_2 = B[10];
+		b_3 = B[11];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		// k = 3
+
+		a_0 = A[12];
+		a_1 = A[13];
+		a_2 = A[14];
+		a_3 = A[15];
+
+		b_0 = B[12];
+		b_1 = B[13];
+		b_2 = B[14];
+		b_3 = B[15];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 16;
+		B += 16;
+
+		}
+	
+	for(; k<kmax; k++)
+		{
+
+		// k = 0
+
+		a_0 = A[0];
+		a_1 = A[1];
+		a_2 = A[2];
+		a_3 = A[3];
+
+		b_0 = B[0];
+		b_1 = B[1];
+		b_2 = B[2];
+		b_3 = B[3];
+
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+		A += 4;
+		B += 4;
+
+		}
+	
+	c_00 = C[0+bs*0] + c_00;
+	c_10 = C[1+bs*0] + c_10;
+	c_20 = C[2+bs*0] + c_20;
+	c_30 = C[3+bs*0] + c_30;
+
+	c_01 = C[0+bs*1] + c_01;
+	c_11 = C[1+bs*1] + c_11;
+	c_21 = C[2+bs*1] + c_21;
+	c_31 = C[3+bs*1] + c_31;
+
+	c_02 = C[0+bs*2] + c_02;
+	c_12 = C[1+bs*2] + c_12;
+	c_22 = C[2+bs*2] + c_22;
+	c_32 = C[3+bs*2] + c_32;
+
+	c_03 = C[0+bs*3] + c_03;
+	c_13 = C[1+bs*3] + c_13;
+	c_23 = C[2+bs*3] + c_23;
+	c_33 = C[3+bs*3] + c_33;
+
+
+	if(kn>3)
+		{
+		tmp = inv_diag_E[3];
+		c_03 *= tmp;
+		c_13 *= tmp;
+		c_23 *= tmp;
+		c_33 *= tmp;
+		tmp = E[2+bs*3];
+		c_02 -= c_03 * tmp;
+		c_12 -= c_13 * tmp;
+		c_22 -= c_23 * tmp;
+		c_32 -= c_33 * tmp;
+		tmp = E[1+bs*3];
+		c_01 -= c_03 * tmp;
+		c_11 -= c_13 * tmp;
+		c_21 -= c_23 * tmp;
+		c_31 -= c_33 * tmp;
+		tmp = E[0+bs*3];
+		c_00 -= c_03 * tmp;
+		c_10 -= c_13 * tmp;
+		c_20 -= c_23 * tmp;
+		c_30 -= c_33 * tmp;
+		}
+
+	if(kn>2)
+		{
+		tmp = inv_diag_E[2];
+		c_02 *= tmp;
+		c_12 *= tmp;
+		c_22 *= tmp;
+		c_32 *= tmp;
+		tmp = E[1+bs*2];
+		c_01 -= c_02 * tmp;
+		c_11 -= c_12 * tmp;
+		c_21 -= c_22 * tmp;
+		c_31 -= c_32 * tmp;
+		tmp = E[0+bs*2];
+		c_00 -= c_02 * tmp;
+		c_10 -= c_12 * tmp;
+		c_20 -= c_22 * tmp;
+		c_30 -= c_32 * tmp;
+		}
+
+	if(kn>1)
+		{
+		tmp = inv_diag_E[1];
+		c_01 *= tmp;
+		c_11 *= tmp;
+		c_21 *= tmp;
+		c_31 *= tmp;
+		tmp = E[0+bs*1];
+		c_00 -= c_01 * tmp;
+		c_10 -= c_11 * tmp;
+		c_20 -= c_21 * tmp;
+		c_30 -= c_31 * tmp;
+		}
+
+	tmp = inv_diag_E[0];
+	c_00 *= tmp;
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nt_ru_inv_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E)
+	{
+	kernel_strsm_nt_ru_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
+
+
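+// sgetrf_nn: LU factorization (no pivoting) of the 4x4 block C - A * B;
+// the unit lower factor is stored below the diagonal of D, U on and above it,
+// and inv_diag_D receives the reciprocals of the diagonal of U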
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgetrf_nn_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	float
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	if(kmax<=0)
+		goto add;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+
+	// factorization
+
+	// first column
+	tmp = 1.0 / c_00;
+	c_10 *= tmp;
+	c_20 *= tmp;
+	c_30 *= tmp;
+
+	inv_diag_D[0] = tmp;
+
+	if(kn==1)
+		goto store;
+
+	// second column
+	c_11 -= c_10 * c_01;
+	c_21 -= c_20 * c_01;
+	c_31 -= c_30 * c_01;
+
+	tmp = 1.0 / c_11;
+	c_21 *= tmp;
+	c_31 *= tmp;
+	
+	inv_diag_D[1] = tmp;
+
+	if(kn==2)
+		goto store;
+
+	// third column
+	c_12 -= c_10 * c_02;
+	c_22 -= c_20 * c_02;
+	c_32 -= c_30 * c_02;
+
+	c_22 -= c_21 * c_12;
+	c_32 -= c_31 * c_12;
+
+	tmp = 1.0 / c_22;
+	c_32 *= tmp;
+
+	inv_diag_D[2] = tmp;
+
+	if(kn==3)
+		goto store;
+
+	// fourth column
+	c_13 -= c_10 * c_03;
+	c_23 -= c_20 * c_03;
+	c_33 -= c_30 * c_03;
+
+	c_23 -= c_21 * c_13;
+	c_33 -= c_31 * c_13;
+
+	c_33 -= c_32 * c_23;
+
+	tmp = 1.0 / c_33;
+
+	inv_diag_D[3] = tmp;
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgetrf_nn_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D)
+	{
+	kernel_sgetrf_nn_4x4_vs_lib4(kmax, A, B, sdb, C, D, inv_diag_D, 4, 4);
+	}
+#endif
+
+
+
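+// strsm_nn_ll_one: D = E^{-1} * (C - A * B), with E unit lower triangular
+// (its diagonal is not referenced); only the leading km x kn part of the 4x4 block is stored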
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nn_ll_one_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	float
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_1, e_2, e_3,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	if(kmax<=0)
+		goto add;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+
+	// solve
+
+	if(km==1)
+		goto store;
+	
+	e_1 = E[1+bs*0];
+	e_2 = E[2+bs*0];
+	e_3 = E[3+bs*0];
+	c_10 -= e_1 * c_00;
+	c_20 -= e_2 * c_00;
+	c_30 -= e_3 * c_00;
+	c_11 -= e_1 * c_01;
+	c_21 -= e_2 * c_01;
+	c_31 -= e_3 * c_01;
+	c_12 -= e_1 * c_02;
+	c_22 -= e_2 * c_02;
+	c_32 -= e_3 * c_02;
+	c_13 -= e_1 * c_03;
+	c_23 -= e_2 * c_03;
+	c_33 -= e_3 * c_03;
+
+	if(km==2)
+		goto store;
+	
+	e_2 = E[2+bs*1];
+	e_3 = E[3+bs*1];
+	c_20 -= e_2 * c_10;
+	c_30 -= e_3 * c_10;
+	c_21 -= e_2 * c_11;
+	c_31 -= e_3 * c_11;
+	c_22 -= e_2 * c_12;
+	c_32 -= e_3 * c_12;
+	c_23 -= e_2 * c_13;
+	c_33 -= e_3 * c_13;
+
+	if(km==3)
+		goto store;
+	
+	e_3 = E[3+bs*2];
+	c_30 -= e_3 * c_20;
+	c_31 -= e_3 * c_21;
+	c_32 -= e_3 * c_22;
+	c_33 -= e_3 * c_23;
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nn_ll_one_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E)
+	{
+	kernel_strsm_nn_ll_one_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, 4, 4);
+	}
+#endif
+
+
+
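+// strsm_nn_ru_inv: D = (C - A * B) * E^{-1}, with E upper triangular and its
+// reciprocal diagonal passed in inv_diag_E; only the leading km x kn part of the 4x4 block is stored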
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nn_ru_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	float
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_00, e_01, e_02, e_03,
+		      e_11, e_12, e_13,
+			        e_22, e_23,
+					      e_33,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	if(kmax<=0)
+		goto add;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+	
+	// solve
+
+	e_00 = inv_diag_E[0];
+	c_00 *= e_00;
+	c_10 *= e_00;
+	c_20 *= e_00;
+	c_30 *= e_00;
+
+	if(kn==1)
+		goto store;
+	
+	e_01 = E[0+bs*1];
+	e_11 = inv_diag_E[1];
+	c_01 -= c_00 * e_01;
+	c_11 -= c_10 * e_01;
+	c_21 -= c_20 * e_01;
+	c_31 -= c_30 * e_01;
+	c_01 *= e_11;
+	c_11 *= e_11;
+	c_21 *= e_11;
+	c_31 *= e_11;
+
+	if(kn==2)
+		goto store;
+	
+	e_02 = E[0+bs*2];
+	e_12 = E[1+bs*2];
+	e_22 = inv_diag_E[2];
+	c_02 -= c_00 * e_02;
+	c_12 -= c_10 * e_02;
+	c_22 -= c_20 * e_02;
+	c_32 -= c_30 * e_02;
+	c_02 -= c_01 * e_12;
+	c_12 -= c_11 * e_12;
+	c_22 -= c_21 * e_12;
+	c_32 -= c_31 * e_12;
+	c_02 *= e_22;
+	c_12 *= e_22;
+	c_22 *= e_22;
+	c_32 *= e_22;
+
+	if(kn==3)
+		goto store;
+	
+	e_03 = E[0+bs*3];
+	e_13 = E[1+bs*3];
+	e_23 = E[2+bs*3];
+	e_33 = inv_diag_E[3];
+	c_03 -= c_00 * e_03;
+	c_13 -= c_10 * e_03;
+	c_23 -= c_20 * e_03;
+	c_33 -= c_30 * e_03;
+	c_03 -= c_01 * e_13;
+	c_13 -= c_11 * e_13;
+	c_23 -= c_21 * e_13;
+	c_33 -= c_31 * e_13;
+	c_03 -= c_02 * e_23;
+	c_13 -= c_12 * e_23;
+	c_23 -= c_22 * e_23;
+	c_33 -= c_32 * e_23;
+	c_03 *= e_33;
+	c_13 *= e_33;
+	c_23 *= e_33;
+	c_33 *= e_33;
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nn_ru_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E)
+	{
+	kernel_strsm_nn_ru_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
+
+
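+// strsm_nn_lu_inv: D = E^{-1} * (C - A * B), with E upper triangular and its
+// reciprocal diagonal passed in inv_diag_E; only the leading km x kn part of the 4x4 block is stored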
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nn_lu_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	float
+		tmp,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		e_00, e_01, e_02, e_03,
+		      e_11, e_12, e_13,
+			        e_22, e_23,
+					      e_33,
+		c_00=0, c_01=0, c_02=0, c_03=0,
+		c_10=0, c_11=0, c_12=0, c_13=0,
+		c_20=0, c_21=0, c_22=0, c_23=0,
+		c_30=0, c_31=0, c_32=0, c_33=0;
+		
+	if(kmax<=0)
+		goto add;
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		b_0 = B[1+bs*0];
+		b_1 = B[1+bs*1];
+		b_2 = B[1+bs*2];
+		b_3 = B[1+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		b_0 = B[2+bs*0];
+		b_1 = B[2+bs*1];
+		b_2 = B[2+bs*2];
+		b_3 = B[2+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		b_0 = B[3+bs*0];
+		b_1 = B[3+bs*1];
+		b_2 = B[3+bs*2];
+		b_3 = B[3+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+		
+		
+		A += 16;
+		B += 4*sdb;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[0+bs*1];
+		b_2 = B[0+bs*2];
+		b_3 = B[0+bs*3];
+		
+		c_00 -= a_0 * b_0;
+		c_10 -= a_1 * b_0;
+		c_20 -= a_2 * b_0;
+		c_30 -= a_3 * b_0;
+
+		c_01 -= a_0 * b_1;
+		c_11 -= a_1 * b_1;
+		c_21 -= a_2 * b_1;
+		c_31 -= a_3 * b_1;
+
+		c_02 -= a_0 * b_2;
+		c_12 -= a_1 * b_2;
+		c_22 -= a_2 * b_2;
+		c_32 -= a_3 * b_2;
+
+		c_03 -= a_0 * b_3;
+		c_13 -= a_1 * b_3;
+		c_23 -= a_2 * b_3;
+		c_33 -= a_3 * b_3;
+
+
+		A += 4;
+		B += 1;
+
+		}
+		
+	add:
+
+	c_00 += C[0+bs*0];
+	c_10 += C[1+bs*0];
+	c_20 += C[2+bs*0];
+	c_30 += C[3+bs*0];
+
+	c_01 += C[0+bs*1];
+	c_11 += C[1+bs*1];
+	c_21 += C[2+bs*1];
+	c_31 += C[3+bs*1];
+
+	c_02 += C[0+bs*2];
+	c_12 += C[1+bs*2];
+	c_22 += C[2+bs*2];
+	c_32 += C[3+bs*2];
+
+	c_03 += C[0+bs*3];
+	c_13 += C[1+bs*3];
+	c_23 += C[2+bs*3];
+	c_33 += C[3+bs*3];
+
+//	printf("\n%f %f %f %f\n", c_00, c_01, c_02, c_03);
+//	printf("\n%f %f %f %f\n", c_10, c_11, c_12, c_13);
+//	printf("\n%f %f %f %f\n", c_20, c_21, c_22, c_23);
+//	printf("\n%f %f %f %f\n", c_30, c_31, c_32, c_33);
+	
+	// solve
+
+	if(km>3)
+		{
+		e_03 = E[0+bs*3];
+		e_13 = E[1+bs*3];
+		e_23 = E[2+bs*3];
+		e_33 = inv_diag_E[3];
+		c_30 *= e_33;
+		c_31 *= e_33;
+		c_32 *= e_33;
+		c_33 *= e_33;
+		c_00 -= e_03 * c_30;
+		c_01 -= e_03 * c_31;
+		c_02 -= e_03 * c_32;
+		c_03 -= e_03 * c_33;
+		c_10 -= e_13 * c_30;
+		c_11 -= e_13 * c_31;
+		c_12 -= e_13 * c_32;
+		c_13 -= e_13 * c_33;
+		c_20 -= e_23 * c_30;
+		c_21 -= e_23 * c_31;
+		c_22 -= e_23 * c_32;
+		c_23 -= e_23 * c_33;
+		}
+	
+	if(km>2)
+		{
+		e_02 = E[0+bs*2];
+		e_12 = E[1+bs*2];
+		e_22 = inv_diag_E[2];
+		c_20 *= e_22;
+		c_21 *= e_22;
+		c_22 *= e_22;
+		c_23 *= e_22;
+		c_00 -= e_02 * c_20;
+		c_01 -= e_02 * c_21;
+		c_02 -= e_02 * c_22;
+		c_03 -= e_02 * c_23;
+		c_10 -= e_12 * c_20;
+		c_11 -= e_12 * c_21;
+		c_12 -= e_12 * c_22;
+		c_13 -= e_12 * c_23;
+		}
+	
+	if(km>1)
+		{
+		e_01 = E[0+bs*1];
+		e_11 = inv_diag_E[1];
+		c_10 *= e_11;
+		c_11 *= e_11;
+		c_12 *= e_11;
+		c_13 *= e_11;
+		c_00 -= e_01 * c_10;
+		c_01 -= e_01 * c_11;
+		c_02 -= e_01 * c_12;
+		c_03 -= e_01 * c_13;
+		}
+	
+	e_00 = inv_diag_E[0];
+	c_00 *= e_00;
+	c_01 *= e_00;
+	c_02 *= e_00;
+	c_03 *= e_00;
+
+	store:
+
+	if(km>=4)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+		D[3+bs*0] = c_30;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+		D[3+bs*1] = c_31;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+		D[3+bs*2] = c_32;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		D[3+bs*3] = c_33;
+		}
+	else if(km>=3)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+		D[2+bs*0] = c_20;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+		D[2+bs*1] = c_21;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+		D[2+bs*2] = c_22;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		D[2+bs*3] = c_23;
+		}
+	else if(km>=2)
+		{
+		D[0+bs*0] = c_00;
+		D[1+bs*0] = c_10;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+		D[1+bs*1] = c_11;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+		D[1+bs*2] = c_12;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		D[1+bs*3] = c_13;
+		}
+	else //if(km>=1)
+		{
+		D[0+bs*0] = c_00;
+
+		if(kn==1)
+			return;
+
+		D[0+bs*1] = c_01;
+
+		if(kn==2)
+			return;
+
+		D[0+bs*2] = c_02;
+
+		if(kn==3)
+			return;
+
+		D[0+bs*3] = c_03;
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nn_lu_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E)
+	{
+	kernel_strsm_nn_lu_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+	}
+#endif
+
diff --git a/kernel/c99/kernel_sgemm_diag_lib4.c b/kernel/c99/kernel_sgemm_diag_lib4.c
new file mode 100644
index 0000000..93df707
--- /dev/null
+++ b/kernel/c99/kernel_sgemm_diag_lib4.c
@@ -0,0 +1,1112 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+// B is the diagonal of a matrix, case beta=0.0
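+// D = alpha * A * diag(B), 4 columns, kmax rows (A and D in 4-row panels with strides sda and sdd)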
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_right_4_a0_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_0, c_1, c_2, c_3;
+	
+	alpha0 = alpha[0];
+		
+	b_0 = alpha0 * B[0];
+	b_1 = alpha0 * B[1];
+	b_2 = alpha0 * B[2];
+	b_3 = alpha0 * B[3];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_0;
+		c_2 = a_2 * b_0;
+		c_3 = a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		c_0 = a_0 * b_1;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_1;
+		c_3 = a_3 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		c_0 = a_0 * b_2;
+		c_1 = a_1 * b_2;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_2;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		c_0 = a_0 * b_3;
+		c_1 = a_1 * b_3;
+		c_2 = a_2 * b_3;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+		D[3+bs*3] = c_3;
+
+		A += 4*sda;
+		D += 4*sdd;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		a_0 = A[0+bs*1];
+		
+		c_0 = a_0 * b_1;
+
+		D[0+bs*1] = c_0;
+		
+
+		a_0 = A[0+bs*2];
+		
+		c_0 = a_0 * b_2;
+
+		D[0+bs*2] = c_0;
+		
+
+		a_0 = A[0+bs*3];
+		
+		c_0 = a_0 * b_3;
+
+		D[0+bs*3] = c_0;
+
+
+		A += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
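+// D = beta * C + alpha * A * diag(B), 4 columns, kmax rows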
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_right_4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_0, c_1, c_2, c_3;
+	
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	b_0 = alpha0 * B[0];
+	b_1 = alpha0 * B[1];
+	b_2 = alpha0 * B[2];
+	b_3 = alpha0 * B[3];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+		c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_2;
+		c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*2] + a_3 * b_2;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		a_0 = A[0+bs*3];
+		a_1 = A[1+bs*3];
+		a_2 = A[2+bs*3];
+		a_3 = A[3+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_3;
+		c_1 = beta0 * C[1+bs*3] + a_1 * b_3;
+		c_2 = beta0 * C[2+bs*3] + a_2 * b_3;
+		c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+		D[3+bs*3] = c_3;
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		a_0 = A[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+		D[0+bs*1] = c_0;
+		
+
+		a_0 = A[0+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+
+		D[0+bs*2] = c_0;
+		
+
+		a_0 = A[0+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_3;
+
+		D[0+bs*3] = c_0;
+
+
+		A += 1;
+		C += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_right_3_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	b_0 = alpha0 * B[0];
+	b_1 = alpha0 * B[1];
+	b_2 = alpha0 * B[2];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+		c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		a_0 = A[0+bs*2];
+		a_1 = A[1+bs*2];
+		a_2 = A[2+bs*2];
+		a_3 = A[3+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_2;
+		c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*2] + a_3 * b_2;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		a_0 = A[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+		D[0+bs*1] = c_0;
+		
+
+		a_0 = A[0+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+
+		D[0+bs*2] = c_0;
+		
+
+		A += 1;
+		C += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_right_2_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	b_0 = alpha0 * B[0];
+	b_1 = alpha0 * B[1];
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		a_0 = A[0+bs*1];
+		a_1 = A[1+bs*1];
+		a_2 = A[2+bs*1];
+		a_3 = A[3+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+		c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		a_0 = A[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+		D[0+bs*1] = c_0;
+		
+
+		A += 1;
+		C += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_right_1_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	b_0 = alpha0 * B[0];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		a_0 = A[0+bs*0];
+		a_1 = A[1+bs*0];
+		a_2 = A[2+bs*0];
+		a_3 = A[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		A += 4*sda;
+		C += 4*sdc;
+		D += 4*sdd;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		a_0 = A[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		A += 1;
+		C += 1;
+		D += 1;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix, case beta=0.0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_4_a0_lib4(int kmax, float *alpha, float *A, float *B, float *D, int alg)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+		
+	a_0 = alpha0 * A[0];
+	a_1 = alpha0 * A[1];
+	a_2 = alpha0 * A[2];
+	a_3 = alpha0 * A[3];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		b_3 = B[3+bs*0];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		b_0 = B[0+bs*1];
+		b_1 = B[1+bs*1];
+		b_2 = B[2+bs*1];
+		b_3 = B[3+bs*1];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		b_0 = B[0+bs*2];
+		b_1 = B[1+bs*2];
+		b_2 = B[2+bs*2];
+		b_3 = B[3+bs*2];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		b_0 = B[0+bs*3];
+		b_1 = B[1+bs*3];
+		b_2 = B[2+bs*3];
+		b_3 = B[3+bs*3];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+		D[3+bs*3] = c_3;
+
+		B += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		b_3 = B[3+bs*0];
+		
+		c_0 = a_0 * b_0;
+		c_1 = a_1 * b_1;
+		c_2 = a_2 * b_2;
+		c_3 = a_3 * b_3;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+	
+		B += 4;
+		D += 4;
+		
+		}
+	
+	}
+#endif
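+// Illustration (a sketch, not part of the kernel above): the "_a0" variant
+// hard-codes beta=0, so for one 4-row panel it computes
+//   D(i,j) = alpha[0]*A[i]*B(i,j),   i=0..3, j=0..kmax-1,
+// i.e. D = alpha*diag(A[0:4])*B with no C operand read; the trailing alg
+// argument is not referenced in this generic C version.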
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int alg)
+	{
+
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0, beta0,
+		a_0, a_1, a_2, a_3,
+		b_0, b_1, b_2, b_3,
+		c_0, c_1, c_2, c_3;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = alpha0 * A[0];
+	a_1 = alpha0 * A[1];
+	a_2 = alpha0 * A[2];
+	a_3 = alpha0 * A[3];
+	
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		b_3 = B[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+		
+
+		b_0 = B[0+bs*1];
+		b_1 = B[1+bs*1];
+		b_2 = B[2+bs*1];
+		b_3 = B[3+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*1] + a_3 * b_3;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		D[3+bs*1] = c_3;
+		
+
+		b_0 = B[0+bs*2];
+		b_1 = B[1+bs*2];
+		b_2 = B[2+bs*2];
+		b_3 = B[3+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*2] + a_3 * b_3;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		D[3+bs*2] = c_3;
+		
+
+		b_0 = B[0+bs*3];
+		b_1 = B[1+bs*3];
+		b_2 = B[2+bs*3];
+		b_3 = B[3+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+		D[3+bs*3] = c_3;
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		b_3 = B[3+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+		c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		D[3+bs*0] = c_3;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_3_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+	{
+	
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0, beta0,
+		a_0, a_1, a_2,
+		b_0, b_1, b_2,
+		c_0, c_1, c_2;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = alpha0 * A[0];
+	a_1 = alpha0 * A[1];
+	a_2 = alpha0 * A[2];
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+		
+
+		b_0 = B[0+bs*1];
+		b_1 = B[1+bs*1];
+		b_2 = B[2+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		D[2+bs*1] = c_2;
+		
+
+		b_0 = B[0+bs*2];
+		b_1 = B[1+bs*2];
+		b_2 = B[2+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		D[2+bs*2] = c_2;
+		
+
+		b_0 = B[0+bs*3];
+		b_1 = B[1+bs*3];
+		b_2 = B[2+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+		D[2+bs*3] = c_2;
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		b_2 = B[2+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+		c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		D[2+bs*0] = c_2;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_2_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+	{
+	
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0, beta0,
+		a_0, a_1,
+		b_0, b_1,
+		c_0, c_1;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = alpha0 * A[0];
+	a_1 = alpha0 * A[1];
+
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+		
+
+		b_0 = B[0+bs*1];
+		b_1 = B[1+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+
+		D[0+bs*1] = c_0;
+		D[1+bs*1] = c_1;
+		
+
+		b_0 = B[0+bs*2];
+		b_1 = B[1+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+
+		D[0+bs*2] = c_0;
+		D[1+bs*2] = c_1;
+		
+
+		b_0 = B[0+bs*3];
+		b_1 = B[1+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+
+		D[0+bs*3] = c_0;
+		D[1+bs*3] = c_1;
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		b_1 = B[1+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+		c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+		D[0+bs*0] = c_0;
+		D[1+bs*0] = c_1;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+	
+	}
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_1_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+	{
+	
+	if(kmax<=0)
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		alpha0, beta0,
+		a_0,
+		b_0,
+		c_0;
+		
+	alpha0 = alpha[0];
+	beta0  = beta[0];
+		
+	a_0 = alpha0 * A[0];
+		
+	for(k=0; k<kmax-3; k+=4)
+		{
+		
+		b_0 = B[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+		
+
+		b_0 = B[0+bs*1];
+		
+		c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+
+		D[0+bs*1] = c_0;
+		
+
+		b_0 = B[0+bs*2];
+		
+		c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+
+		D[0+bs*2] = c_0;
+		
+
+		b_0 = B[0+bs*3];
+		
+		c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+
+		D[0+bs*3] = c_0;
+
+		B += 16;
+		C += 16;
+		D += 16;
+		
+		}
+	for(; k<kmax; k++)
+		{
+		
+		b_0 = B[0+bs*0];
+		
+		c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+		D[0+bs*0] = c_0;
+	
+		B += 4;
+		C += 4;
+		D += 4;
+		
+		}
+		
+	}
+#endif
+
+
+
diff --git a/kernel/c99/kernel_sgemv_4_lib4.c b/kernel/c99/kernel_sgemv_4_lib4.c
new file mode 100644
index 0000000..03975f4
--- /dev/null
+++ b/kernel/c99/kernel_sgemv_4_lib4.c
@@ -0,0 +1,1010 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_n_4_gen_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1)
+	{
+
+	const int bs = 4;
+
+	int k;
+
+	float
+		x_0,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	k=0;
+	for(; k<kmax-3; k+=4)
+		{
+
+		x_0 = x[0];
+
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[1+bs*0] * x_0;
+		y_2 += A[2+bs*0] * x_0;
+		y_3 += A[3+bs*0] * x_0;
+		
+		x_0 = x[1];
+
+		y_0 += A[0+bs*1] * x_0;
+		y_1 += A[1+bs*1] * x_0;
+		y_2 += A[2+bs*1] * x_0;
+		y_3 += A[3+bs*1] * x_0;
+		
+		x_0 = x[2];
+
+		y_0 += A[0+bs*2] * x_0;
+		y_1 += A[1+bs*2] * x_0;
+		y_2 += A[2+bs*2] * x_0;
+		y_3 += A[3+bs*2] * x_0;
+		
+		x_0 = x[3];
+
+		y_0 += A[0+bs*3] * x_0;
+		y_1 += A[1+bs*3] * x_0;
+		y_2 += A[2+bs*3] * x_0;
+		y_3 += A[3+bs*3] * x_0;
+		
+		A += 4*bs;
+		x += 4;
+
+		}
+
+	for(; k<kmax; k++)
+		{
+
+		x_0 = x[0];
+
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[1+bs*0] * x_0;
+		y_2 += A[2+bs*0] * x_0;
+		y_3 += A[3+bs*0] * x_0;
+		
+		A += 1*bs;
+		x += 1;
+
+		}
+
+	y_0 = alpha[0]*y_0 + beta[0]*y[0];
+	y_1 = alpha[0]*y_1 + beta[0]*y[1];
+	y_2 = alpha[0]*y_2 + beta[0]*y[2];
+	y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+	if(k0<=0 & k1>3)
+		{
+		z[0] = y_0;
+		z[1] = y_1;
+		z[2] = y_2;
+		z[3] = y_3;
+		}
+	else
+		{
+		if(k0<=0 & k1>0) z[0] = y_0;
+		if(k0<=1 & k1>1) z[1] = y_1;
+		if(k0<=2 & k1>2) z[2] = y_2;
+		if(k0<=3 & k1>3) z[3] = y_3;
+		}
+
+	}
+#endif
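+// Illustration (a sketch, not part of the kernel above): the _gen suffix adds a
+// row mask; entry i of the 4-row result
+//   z[i] = alpha[0]*(A(i,:)*x) + beta[0]*y[i]
+// is stored only if k0 <= i < k1. An assumed call that updates rows 1..2 only:
+//   kernel_sgemv_n_4_gen_lib4(kmax, &alpha, A, x, &beta, y, z, 1, 3);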
+	
+	
+	
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_n_4_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z)
+	{
+
+	kernel_sgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, 4);
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_n_4_vs_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1)
+	{
+
+	kernel_sgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, k1);
+
+	}
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_t_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km)
+	{
+
+	const int bs  = 4;
+	
+	int k, kend;
+	
+	float
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	k=0;
+	if(offA!=0) // 1, 2, 3
+		{
+		kend = 4-offA<kmax ? 4-offA : kmax;
+		for(; k<kend; k++)
+			{
+			
+			x_0 = x[0];
+		
+			y_0 += A[0+bs*0] * x_0;
+			y_1 += A[0+bs*1] * x_0;
+			y_2 += A[0+bs*2] * x_0;
+			y_3 += A[0+bs*3] * x_0;
+		
+			A += 1;
+			x += 1;
+			
+			}
+		A += bs*(sda-1);
+		}
+	for(; k<kmax-bs+1; k+=bs)
+		{
+		
+		x_0 = x[0];
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+		
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[0+bs*1] * x_0;
+		y_2 += A[0+bs*2] * x_0;
+		y_3 += A[0+bs*3] * x_0;
+
+		y_0 += A[1+bs*0] * x_1;
+		y_1 += A[1+bs*1] * x_1;
+		y_2 += A[1+bs*2] * x_1;
+		y_3 += A[1+bs*3] * x_1;
+		
+		y_0 += A[2+bs*0] * x_2;
+		y_1 += A[2+bs*1] * x_2;
+		y_2 += A[2+bs*2] * x_2;
+		y_3 += A[2+bs*3] * x_2;
+
+		y_0 += A[3+bs*0] * x_3;
+		y_1 += A[3+bs*1] * x_3;
+		y_2 += A[3+bs*2] * x_3;
+		y_3 += A[3+bs*3] * x_3;
+		
+		A += sda*bs;
+		x += 4;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		x_0 = x[0];
+	
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[0+bs*1] * x_0;
+		y_2 += A[0+bs*2] * x_0;
+		y_3 += A[0+bs*3] * x_0;
+	
+		A += 1;
+		x += 1;
+		
+		}
+
+	y_0 = alpha[0]*y_0 + beta[0]*y[0];
+	y_1 = alpha[0]*y_1 + beta[0]*y[1];
+	y_2 = alpha[0]*y_2 + beta[0]*y[2];
+	y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+	if(km>=4)
+		{
+		z[0] = y_0;
+		z[1] = y_1;
+		z[2] = y_2;
+		z[3] = y_3;
+		}
+	else
+		{
+		z[0] = y_0;
+		if(km>=2)
+			{
+			z[1] = y_1;
+			if(km>2)
+				{
+				z[2] = y_2;
+				}
+			}
+		}
+
+	}
+#endif
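+// Illustration (a sketch, not part of the kernel above): this is the transposed
+// counterpart over one 4-column block,
+//   z[j] = alpha[0]*(A(:,j)^T * x) + beta[0]*y[j],   j=0..3,
+// where offA in 0..3 is the row offset of A inside its first 4-row panel
+// (handled by the initial clean-up loop) and km<4 masks which results are stored.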
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_t_4_lib4(int kmax, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z)
+	{
+
+	kernel_sgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, 4);
+
+	}
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_t_4_vs_lib4(int kmax, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1)
+	{
+
+	kernel_sgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, k1);
+
+	}
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_ln_inv_4_vs_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn)
+	{
+
+	const int bs = 4;
+	
+	int k;
+
+	float
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	k=0;
+	for(; k<kmax-3; k+=4)
+		{
+
+		x_0 = x[0];
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+
+		y_0 -= A[0+bs*0] * x_0;
+		y_1 -= A[1+bs*0] * x_0;
+		y_2 -= A[2+bs*0] * x_0;
+		y_3 -= A[3+bs*0] * x_0;
+
+		y_0 -= A[0+bs*1] * x_1;
+		y_1 -= A[1+bs*1] * x_1;
+		y_2 -= A[2+bs*1] * x_1;
+		y_3 -= A[3+bs*1] * x_1;
+
+		y_0 -= A[0+bs*2] * x_2;
+		y_1 -= A[1+bs*2] * x_2;
+		y_2 -= A[2+bs*2] * x_2;
+		y_3 -= A[3+bs*2] * x_2;
+
+		y_0 -= A[0+bs*3] * x_3;
+		y_1 -= A[1+bs*3] * x_3;
+		y_2 -= A[2+bs*3] * x_3;
+		y_3 -= A[3+bs*3] * x_3;
+		
+		A += 4*bs;
+		x += 4;
+
+		}
+
+	y_0 = y[0] + y_0;
+	y_1 = y[1] + y_1;
+	y_2 = y[2] + y_2;
+	y_3 = y[3] + y_3;
+
+	float
+		a_00, a_10, a_20, a_30,
+		a_11, a_21, a_31;
+	
+	// a_00
+	a_00 = inv_diag_A[0];
+	a_10 = A[1+bs*0];
+	a_20 = A[2+bs*0];
+	a_30 = A[3+bs*0];
+	y_0 *= a_00;
+	z[0] = y_0;
+	y_1 -= a_10 * y_0;
+	y_2 -= a_20 * y_0;
+	y_3 -= a_30 * y_0;
+
+	if(kn==1)
+		{
+		if(km==1)
+			return;
+		y[1] = y_1;
+		if(km==2)
+			return;
+		y[2] = y_2;
+		if(km==3)
+			return;
+		y[3] = y_3;
+		return;
+		}
+
+	// a_11
+	a_11 = inv_diag_A[1];
+	a_21 = A[2+bs*1];
+	a_31 = A[3+bs*1];
+	y_1 *= a_11;	
+	z[1] = y_1;
+	y_2 -= a_21 * y_1;
+	y_3 -= a_31 * y_1;
+
+	if(kn==2)
+		{
+		if(km==2)
+			return;
+		y[2] = y_2;
+		if(km==3)
+			return;
+		y[3] = y_3;
+		return;
+		}
+
+	// a_22
+	a_00 = inv_diag_A[2];
+	a_10 = A[3+bs*2];
+	y_2 *= a_00;
+	z[2] = y_2;
+	y_3 -= a_10 * y_2;
+
+	if(kn==3)
+		{
+		if(km==3)
+			return;
+		y[3] = y_3;
+
+		return;
+		}
+
+	// a_33
+	a_11 = inv_diag_A[3];
+	y_3 *= a_11;	
+	z[3] = y_3;
+
+	}
+#endif
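+// Illustration (a sketch, not part of the kernel above): the main loop is a
+// 4-row gemv that subtracts the contribution of the already-solved unknowns,
+// then the tail performs forward substitution on the 4x4 lower-triangular
+// diagonal block. With the precomputed reciprocals in inv_diag_A, each step
+//   z[i] = (y[i] - sum_{j<i} A(i,j)*z[j]) / A(i,i)
+// is carried out as a multiplication by inv_diag_A[i]; km and kn mask how many
+// rows and columns of the block are valid.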
+	
+
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_ln_inv_4_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z)
+	{
+
+	kernel_strsv_ln_inv_4_vs_lib4(kmax, A, inv_diag_A, x, y, z, 4, 4);
+
+
+	}
+#endif
+	
+	
+		
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_4_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
+	{
+
+	const int bs = 4;
+	
+	int
+		k;
+	
+	float *tA, *tx;
+	tA = A;
+	tx = x;
+
+	float
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	k=4;
+	A += 4 + (sda-1)*bs;
+	x += 4;
+	for(; k<kmax-3; k+=4)
+		{
+		
+		x_0 = x[0];
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+		
+		y_0 -= A[0+bs*0] * x_0;
+		y_1 -= A[0+bs*1] * x_0;
+		y_2 -= A[0+bs*2] * x_0;
+		y_3 -= A[0+bs*3] * x_0;
+
+		y_0 -= A[1+bs*0] * x_1;
+		y_1 -= A[1+bs*1] * x_1;
+		y_2 -= A[1+bs*2] * x_1;
+		y_3 -= A[1+bs*3] * x_1;
+		
+		y_0 -= A[2+bs*0] * x_2;
+		y_1 -= A[2+bs*1] * x_2;
+		y_2 -= A[2+bs*2] * x_2;
+		y_3 -= A[2+bs*3] * x_2;
+
+		y_0 -= A[3+bs*0] * x_3;
+		y_1 -= A[3+bs*1] * x_3;
+		y_2 -= A[3+bs*2] * x_3;
+		y_3 -= A[3+bs*3] * x_3;
+		
+		A += sda*bs;
+		x += 4;
+
+		}
+	for(; k<kmax; k++)
+		{
+		
+		x_0 = x[0];
+		
+		y_0 -= A[0+bs*0] * x_0;
+		y_1 -= A[0+bs*1] * x_0;
+		y_2 -= A[0+bs*2] * x_0;
+		y_3 -= A[0+bs*3] * x_0;
+		
+		A += 1;//sda*bs;
+		x += 1;
+
+		}
+	
+	y_0 = y[0] + y_0;
+	y_1 = y[1] + y_1;
+	y_2 = y[2] + y_2;
+	y_3 = y[3] + y_3;
+
+	A = tA;
+	x = tx;
+
+	// bottom triangle
+	y_3 *= inv_diag_A[3];
+	z[3] = y_3;
+
+	y_2 -= A[3+bs*2] * y_3;
+	y_2 *= inv_diag_A[2];
+	z[2] = y_2;
+
+	// square
+	y_0 -= A[2+bs*0]*y_2 + A[3+bs*0]*y_3;
+	y_1 -= A[2+bs*1]*y_2 + A[3+bs*1]*y_3;
+		
+	// top triangle
+	y_1 *= inv_diag_A[1];
+	z[1] = y_1;
+
+	y_0 -= A[1+bs*0] * y_1;
+	y_0 *= inv_diag_A[0];
+	z[0] = y_0;
+
+	}
+#endif
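+// Illustration (a sketch, not part of the kernel above): for the transposed
+// lower-triangular solve the loop first accumulates the tail contribution
+// A(4:kmax-1, 0:3)^T * x(4:kmax-1), then the 4x4 block is solved by backward
+// substitution (hence starting from the bottom triangle):
+//   z[i] = (y[i] - sum_{j>i} A(j,i)*z[j]) * inv_diag_A[i],   i = 3,2,1,0.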
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_3_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
+	{
+
+	const int bs = 4;
+	
+	int
+		k;
+	
+	float *tA, *tx;
+	tA = A;
+	tx = x;
+
+	float
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0;
+	
+	k = 3;
+	if(kmax>4)
+		{
+		// clean up at the beginning
+		x_3 = x[3];
+
+		y_0 -= A[3+bs*0] * x_3;
+		y_1 -= A[3+bs*1] * x_3;
+		y_2 -= A[3+bs*2] * x_3;
+
+		k=4;
+		A += 4 + (sda-1)*bs;
+		x += 4;
+		for(; k<kmax-3; k+=4)
+			{
+			
+			x_0 = x[0];
+			x_1 = x[1];
+			x_2 = x[2];
+			x_3 = x[3];
+			
+			y_0 -= A[0+bs*0] * x_0;
+			y_1 -= A[0+bs*1] * x_0;
+			y_2 -= A[0+bs*2] * x_0;
+
+			y_0 -= A[1+bs*0] * x_1;
+			y_1 -= A[1+bs*1] * x_1;
+			y_2 -= A[1+bs*2] * x_1;
+			
+			y_0 -= A[2+bs*0] * x_2;
+			y_1 -= A[2+bs*1] * x_2;
+			y_2 -= A[2+bs*2] * x_2;
+
+			y_0 -= A[3+bs*0] * x_3;
+			y_1 -= A[3+bs*1] * x_3;
+			y_2 -= A[3+bs*2] * x_3;
+			
+			A += sda*bs;
+			x += 4;
+
+			}
+		}
+	else
+		{
+		A += 3;
+		x += 1;
+		}
+	for(; k<kmax; k++)
+		{
+		
+		x_0 = x[0];
+		
+		y_0 -= A[0+bs*0] * x_0;
+		y_1 -= A[0+bs*1] * x_0;
+		y_2 -= A[0+bs*2] * x_0;
+		
+		A += 1;//sda*bs;
+		x += 1;
+
+		}
+
+	y_0 = y[0] + y_0;
+	y_1 = y[1] + y_1;
+	y_2 = y[2] + y_2;
+
+	A = tA;
+	x = tx;
+
+	// bottom triangle
+	y_2 *= inv_diag_A[2];
+	z[2] = y_2;
+
+	// square
+	y_0 -= A[2+bs*0]*y_2;
+	y_1 -= A[2+bs*1]*y_2;
+		
+	// top triangle
+	y_1 *= inv_diag_A[1];
+	z[1] = y_1;
+
+	y_0 -= A[1+bs*0] * y_1;
+	y_0 *= inv_diag_A[0];
+	z[0] = y_0;
+
+	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_2_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
+	{
+
+	const int bs = 4;
+	
+	int
+		k;
+	
+	float *tA, *tx;
+	tA = A;
+	tx = x;
+
+	float
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0;
+	
+	k = 2;
+	if(kmax>4)
+		{
+		// clean up at the beginning
+		x_2 = x[2];
+		x_3 = x[3];
+
+		y_0 -= A[2+bs*0] * x_2;
+		y_1 -= A[2+bs*1] * x_2;
+
+		y_0 -= A[3+bs*0] * x_3;
+		y_1 -= A[3+bs*1] * x_3;
+
+		k=4;
+		A += 4 + (sda-1)*bs;
+		x += 4;
+		for(; k<kmax-3; k+=4)
+			{
+			
+			x_0 = x[0];
+			x_1 = x[1];
+			x_2 = x[2];
+			x_3 = x[3];
+			
+			y_0 -= A[0+bs*0] * x_0;
+			y_1 -= A[0+bs*1] * x_0;
+
+			y_0 -= A[1+bs*0] * x_1;
+			y_1 -= A[1+bs*1] * x_1;
+			
+			y_0 -= A[2+bs*0] * x_2;
+			y_1 -= A[2+bs*1] * x_2;
+
+			y_0 -= A[3+bs*0] * x_3;
+			y_1 -= A[3+bs*1] * x_3;
+			
+			A += sda*bs;
+			x += 4;
+
+			}
+		}
+	else
+		{
+		A += 2;
+		x += 2;
+		}
+	for(; k<kmax; k++)
+		{
+		
+		x_0 = x[0];
+		
+		y_0 -= A[0+bs*0] * x_0;
+		y_1 -= A[0+bs*1] * x_0;
+		
+		A += 1;//sda*bs;
+		x += 1;
+
+		}
+
+	y_0 = y[0] + y_0;
+	y_1 = y[1] + y_1;
+
+	A = tA;
+	x = tx;
+
+	// top triangle
+	y_1 *= inv_diag_A[1];
+	z[1] = y_1;
+
+	y_0 -= A[1+bs*0] * y_1;
+	y_0 *= inv_diag_A[0];
+	z[0] = y_0;
+
+	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_1_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
+	{
+
+	const int bs = 4;
+	
+	int
+		k;
+	
+	float *tA, *tx;
+	tA = A;
+	tx = x;
+
+	float
+		x_0, x_1, x_2, x_3,
+		y_0=0;
+	
+	k = 1;
+	if(kmax>4)
+		{
+		// clean up at the beginning
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+
+		y_0 -= A[1+bs*0] * x_1;
+		y_0 -= A[2+bs*0] * x_2;
+		y_0 -= A[3+bs*0] * x_3;
+
+		k=4;
+		A += 4 + (sda-1)*bs;
+		x += 4;
+		for(; k<kmax-3; k+=4)
+			{
+			
+			x_0 = x[0];
+			x_1 = x[1];
+			x_2 = x[2];
+			x_3 = x[3];
+			
+			y_0 -= A[0+bs*0] * x_0;
+			y_0 -= A[1+bs*0] * x_1;
+			y_0 -= A[2+bs*0] * x_2;
+			y_0 -= A[3+bs*0] * x_3;
+			
+			A += sda*bs;
+			x += 4;
+
+			}
+		}
+	else
+		{
+		A += 1;
+		x += 1;
+		}
+	for(; k<kmax; k++)
+		{
+		
+		x_0 = x[0];
+		
+		y_0 -= A[0+bs*0] * x_0;
+		
+		A += 1;//sda*bs;
+		x += 1;
+
+		}
+
+	y_0 = y[0] + y_0;
+
+	A = tA;
+	x = tx;
+
+	// top triangle
+	y_0 *= inv_diag_A[0];
+	z[0] = y_0;
+
+	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmv_un_4_lib4(int kmax, float *A, float *x, float *z)
+	{
+
+	const int bs = 4;
+	
+	int k;
+
+	float
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	x_0 = x[0];
+	x_1 = x[1];
+	x_2 = x[2];
+	x_3 = x[3];
+
+	y_0 += A[0+bs*0] * x_0;
+/*	y_1 += A[1+bs*0] * x_0;*/
+/*	y_2 += A[2+bs*0] * x_0;*/
+/*	y_3 += A[3+bs*0] * x_0;*/
+
+	y_0 += A[0+bs*1] * x_1;
+	y_1 += A[1+bs*1] * x_1;
+/*	y_2 += A[2+bs*1] * x_1;*/
+/*	y_3 += A[3+bs*1] * x_1;*/
+
+	y_0 += A[0+bs*2] * x_2;
+	y_1 += A[1+bs*2] * x_2;
+	y_2 += A[2+bs*2] * x_2;
+/*	y_3 += A[3+bs*2] * x_2;*/
+
+	y_0 += A[0+bs*3] * x_3;
+	y_1 += A[1+bs*3] * x_3;
+	y_2 += A[2+bs*3] * x_3;
+	y_3 += A[3+bs*3] * x_3;
+	
+	A += 4*bs;
+	x += 4;
+
+	k=4;
+	for(; k<kmax-3; k+=4)
+		{
+
+		x_0 = x[0];
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[1+bs*0] * x_0;
+		y_2 += A[2+bs*0] * x_0;
+		y_3 += A[3+bs*0] * x_0;
+
+		y_0 += A[0+bs*1] * x_1;
+		y_1 += A[1+bs*1] * x_1;
+		y_2 += A[2+bs*1] * x_1;
+		y_3 += A[3+bs*1] * x_1;
+
+		y_0 += A[0+bs*2] * x_2;
+		y_1 += A[1+bs*2] * x_2;
+		y_2 += A[2+bs*2] * x_2;
+		y_3 += A[3+bs*2] * x_2;
+
+		y_0 += A[0+bs*3] * x_3;
+		y_1 += A[1+bs*3] * x_3;
+		y_2 += A[2+bs*3] * x_3;
+		y_3 += A[3+bs*3] * x_3;
+		
+		A += 4*bs;
+		x += 4;
+
+		}
+
+	for(; k<kmax; k++)
+		{
+
+		x_0 = x[0];
+
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[1+bs*0] * x_0;
+		y_2 += A[2+bs*0] * x_0;
+		y_3 += A[3+bs*0] * x_0;
+		
+		A += 1*bs;
+		x += 1;
+
+		}
+
+	z[0] = y_0;
+	z[1] = y_1;
+	z[2] = y_2;
+	z[3] = y_3;
+
+	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmv_ut_4_vs_lib4(int kmax, float *A, int sda, float *x, float *z, int km)
+	{
+
+	const int bs  = 4;
+	
+	int
+		k;
+	
+	float
+		x_0, x_1, x_2, x_3,
+		y_0=0, y_1=0, y_2=0, y_3=0;
+	
+	k=0;
+	for(; k<kmax-4; k+=4)
+		{
+		
+		x_0 = x[0];
+		x_1 = x[1];
+		x_2 = x[2];
+		x_3 = x[3];
+		
+		y_0 += A[0+bs*0] * x_0;
+		y_1 += A[0+bs*1] * x_0;
+		y_2 += A[0+bs*2] * x_0;
+		y_3 += A[0+bs*3] * x_0;
+
+		y_0 += A[1+bs*0] * x_1;
+		y_1 += A[1+bs*1] * x_1;
+		y_2 += A[1+bs*2] * x_1;
+		y_3 += A[1+bs*3] * x_1;
+		
+		y_0 += A[2+bs*0] * x_2;
+		y_1 += A[2+bs*1] * x_2;
+		y_2 += A[2+bs*2] * x_2;
+		y_3 += A[2+bs*3] * x_2;
+
+		y_0 += A[3+bs*0] * x_3;
+		y_1 += A[3+bs*1] * x_3;
+		y_2 += A[3+bs*2] * x_3;
+		y_3 += A[3+bs*3] * x_3;
+		
+		A += sda*bs;
+		x += 4;
+
+		}
+
+	x_0 = x[0];
+	x_1 = x[1];
+	x_2 = x[2];
+	x_3 = x[3];
+	
+	y_0 += A[0+bs*0] * x_0;
+	y_1 += A[0+bs*1] * x_0;
+	y_2 += A[0+bs*2] * x_0;
+	y_3 += A[0+bs*3] * x_0;
+
+/*	y_0 += A[1+bs*0] * x_1;*/
+	y_1 += A[1+bs*1] * x_1;
+	y_2 += A[1+bs*2] * x_1;
+	y_3 += A[1+bs*3] * x_1;
+	
+/*	y_0 += A[2+bs*0] * x_2;*/
+/*	y_1 += A[2+bs*1] * x_2;*/
+	y_2 += A[2+bs*2] * x_2;
+	y_3 += A[2+bs*3] * x_2;
+
+/*	y_0 += A[3+bs*0] * x_3;*/
+/*	y_1 += A[3+bs*1] * x_3;*/
+/*	y_2 += A[3+bs*2] * x_3;*/
+	y_3 += A[3+bs*3] * x_3;
+	
+//	A += sda*bs;
+//	x += 4;
+
+	// store_vs
+	store:
+	if(km>=4)
+		{
+		z[0] = y_0;
+		z[1] = y_1;
+		z[2] = y_2;
+		z[3] = y_3;
+		}
+	else
+		{
+		z[0] = y_0;
+		if(km>=2)
+			{
+			z[1] = y_1;
+			if(km>2)
+				{
+				z[2] = y_2;
+				}
+			}
+		}
+
+	}
+#endif
+	
+	
+	
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmv_ut_4_lib4(int kmax, float *A, int sda, float *x, float *z)
+	{
+	
+	kernel_strmv_ut_4_vs_lib4(kmax, A, sda, x, z, 4);
+
+	}
+#endif
+
+
+
+
+
+
diff --git a/kernel/c99/kernel_sgetrf_pivot_4_lib4.c b/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..fdec8de
--- /dev/null
+++ b/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
@@ -0,0 +1,786 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_s_aux.h"
+
+
+
+// C numbering, starting from 0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void sidamax_lib4(int n, int offset, float *pA, int sda, int *p_idamax, float *p_amax)
+	{
+
+	int idamax, ii;
+	float tmp, amax;
+		
+	p_idamax[0] = -1;
+	if(n<1)
+		return;
+
+	const int bs = 4;
+
+	int na = (bs - offset%bs)%bs;
+	na = n<na ? n : na;
+
+	amax = -1.0;
+	ii = 0;
+	if(na>0)
+		{
+		for( ; ii<na; ii++)
+			{
+			tmp = fabs(pA[0]);
+			if(tmp>amax)
+				{
+				idamax = ii+0;
+				amax = tmp;
+				}
+			pA += 1;
+			}
+		pA += bs*(sda-1);
+		}
+	for( ; ii<n-3; ii+=4)
+		{
+		tmp = fabs(pA[0]);
+		if(tmp>amax)
+			{
+			idamax = ii+0;
+			amax = tmp;
+			}
+		tmp = fabs(pA[1]);
+		if(tmp>amax)
+			{
+			idamax = ii+1;
+			amax = tmp;
+			}
+		tmp = fabs(pA[2]);
+		if(tmp>amax)
+			{
+			idamax = ii+2;
+			amax = tmp;
+			}
+		tmp = fabs(pA[3]);
+		if(tmp>amax)
+			{
+			idamax = ii+3;
+			amax = tmp;
+			}
+		pA += bs*sda;
+		}
+	for( ; ii<n; ii++)
+		{
+		tmp = fabs(pA[0]);
+		if(tmp>amax)
+			{
+			idamax = ii+0;
+			amax = tmp;
+			}
+		pA += 1;
+		}
+	
+	p_amax[0] = amax;
+	p_idamax[0] = idamax;
+
+	return;
+
+	}
+#endif
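+// Illustration (a sketch, not part of the routine above): sidamax_lib4 is an
+// isamax-style search on a panel-major column: it scans n entries starting at
+// row offset `offset` inside the first 4-row panel (panels bs*sda floats apart)
+// and returns, through p_idamax and p_amax, the index and absolute value of the
+// largest-magnitude entry, which the factorization kernels below use to pick
+// the partial pivot.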
+
+
+
+// C numbering (starting from zero) in the ipiv
+// it processes m>=4 rows and 4 cols
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgetrf_pivot_4_lib4(int m, float *pA, int sda, float *inv_diag_A, int* ipiv)
+	{
+
+	const int bs = 4;
+
+	// assume m>=4
+	int ma = m-4;
+
+	float
+		tmp0, tmp1, tmp2, tmp3,
+		u_00, u_01, u_02, u_03,
+		      u_11, u_12, u_13,
+		            u_22, u_23,
+		                  u_33;
+	
+	float
+		*pB;
+	
+	int 
+		k, idamax;
+	
+	// first column
+	sidamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
+	ipiv[0] = idamax;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[0]!=0)
+			srowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+		tmp0 = 1.0 / pA[0+bs*0];
+		inv_diag_A[0] = tmp0;
+		pA[1+bs*0] *= tmp0;
+		pA[2+bs*0] *= tmp0;
+		pA[3+bs*0] *= tmp0;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*0] *= tmp0;
+			pB[1+bs*0] *= tmp0;
+			pB[2+bs*0] *= tmp0;
+			pB[3+bs*0] *= tmp0;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*0] *= tmp0;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[0] = 0.0;
+		}
+
+	// second column
+	u_01  = pA[0+bs*1];
+	tmp1  = pA[1+bs*1];
+	tmp2  = pA[2+bs*1];
+	tmp3  = pA[3+bs*1];
+	tmp1 -= pA[1+bs*0] * u_01;
+	tmp2 -= pA[2+bs*0] * u_01;
+	tmp3 -= pA[3+bs*0] * u_01;
+	pA[1+bs*1] = tmp1;
+	pA[2+bs*1] = tmp2;
+	pA[3+bs*1] = tmp3;
+	pB = pA + bs*sda;
+	for(k=0; k<ma-3; k+=4)
+		{
+		tmp0  = pB[0+bs*1];
+		tmp1  = pB[1+bs*1];
+		tmp2  = pB[2+bs*1];
+		tmp3  = pB[3+bs*1];
+		tmp0 -= pB[0+bs*0] * u_01;
+		tmp1 -= pB[1+bs*0] * u_01;
+		tmp2 -= pB[2+bs*0] * u_01;
+		tmp3 -= pB[3+bs*0] * u_01;
+		pB[0+bs*1] = tmp0;
+		pB[1+bs*1] = tmp1;
+		pB[2+bs*1] = tmp2;
+		pB[3+bs*1] = tmp3;
+		pB += bs*sda;
+		}
+	for( ; k<ma; k++)
+		{
+		tmp0 = pB[0+bs*1];
+		tmp0 -= pB[0+bs*0] * u_01;
+		pB[0+bs*1] = tmp0;
+		pB += 1;
+		}
+
+	sidamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
+	ipiv[1] = idamax+1;
+	if(tmp1!=0)
+		{
+		if(ipiv[1]!=1)
+			srowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+		tmp1 = 1.0 / pA[1+bs*1];
+		inv_diag_A[1] = tmp1;
+		pA[2+bs*1] *= tmp1;
+		pA[3+bs*1] *= tmp1;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*1] *= tmp1;
+			pB[1+bs*1] *= tmp1;
+			pB[2+bs*1] *= tmp1;
+			pB[3+bs*1] *= tmp1;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*1] *= tmp1;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[1] = 0.0;
+		}
+
+	// third column
+	u_02  = pA[0+bs*2];
+	u_12  = pA[1+bs*2];
+	u_12 -= pA[1+bs*0] * u_02;
+	pA[1+bs*2] = u_12;
+	tmp2  = pA[2+bs*2];
+	tmp3  = pA[3+bs*2];
+	tmp2 -= pA[2+bs*0] * u_02;
+	tmp3 -= pA[3+bs*0] * u_02;
+	tmp2 -= pA[2+bs*1] * u_12;
+	tmp3 -= pA[3+bs*1] * u_12;
+	pA[2+bs*2] = tmp2;
+	pA[3+bs*2] = tmp3;
+	pB = pA + bs*sda;
+	for(k=0; k<ma-3; k+=4)
+		{
+		tmp0  = pB[0+bs*2];
+		tmp1  = pB[1+bs*2];
+		tmp2  = pB[2+bs*2];
+		tmp3  = pB[3+bs*2];
+		tmp0 -= pB[0+bs*0] * u_02;
+		tmp1 -= pB[1+bs*0] * u_02;
+		tmp2 -= pB[2+bs*0] * u_02;
+		tmp3 -= pB[3+bs*0] * u_02;
+		tmp0 -= pB[0+bs*1] * u_12;
+		tmp1 -= pB[1+bs*1] * u_12;
+		tmp2 -= pB[2+bs*1] * u_12;
+		tmp3 -= pB[3+bs*1] * u_12;
+		pB[0+bs*2] = tmp0;
+		pB[1+bs*2] = tmp1;
+		pB[2+bs*2] = tmp2;
+		pB[3+bs*2] = tmp3;
+		pB += bs*sda;
+		}
+	for( ; k<ma; k++)
+		{
+		tmp0  = pB[0+bs*2];
+		tmp0 -= pB[0+bs*0] * u_02;
+		tmp0 -= pB[0+bs*1] * u_12;
+		pB[0+bs*2] = tmp0;
+		pB += 1;
+		}
+
+	sidamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+	ipiv[2] = idamax+2;
+	if(tmp2!=0)
+		{
+		if(ipiv[2]!=2)
+			srowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+		tmp2 = 1.0 / pA[2+bs*2];
+		inv_diag_A[2] = tmp2;
+		pA[3+bs*2] *= tmp2;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*2] *= tmp2;
+			pB[1+bs*2] *= tmp2;
+			pB[2+bs*2] *= tmp2;
+			pB[3+bs*2] *= tmp2;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*2] *= tmp2;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[2] = 0.0;
+		}
+
+	// fourth column
+	u_03  = pA[0+bs*3];
+	u_13  = pA[1+bs*3];
+	u_13 -= pA[1+bs*0] * u_03;
+	pA[1+bs*3] = u_13;
+	u_23  = pA[2+bs*3];
+	u_23 -= pA[2+bs*0] * u_03;
+	u_23 -= pA[2+bs*1] * u_13;
+	pA[2+bs*3] = u_23;
+	tmp3  = pA[3+bs*3];
+	tmp3 -= pA[3+bs*0] * u_03;
+	tmp3 -= pA[3+bs*1] * u_13;
+	tmp3 -= pA[3+bs*2] * u_23;
+	pA[3+bs*3] = tmp3;
+	pB = pA + bs*sda;
+	for(k=0; k<ma-3; k+=4)
+		{
+		tmp0  = pB[0+bs*3];
+		tmp1  = pB[1+bs*3];
+		tmp2  = pB[2+bs*3];
+		tmp3  = pB[3+bs*3];
+		tmp0 -= pB[0+bs*0] * u_03;
+		tmp1 -= pB[1+bs*0] * u_03;
+		tmp2 -= pB[2+bs*0] * u_03;
+		tmp3 -= pB[3+bs*0] * u_03;
+		tmp0 -= pB[0+bs*1] * u_13;
+		tmp1 -= pB[1+bs*1] * u_13;
+		tmp2 -= pB[2+bs*1] * u_13;
+		tmp3 -= pB[3+bs*1] * u_13;
+		tmp0 -= pB[0+bs*2] * u_23;
+		tmp1 -= pB[1+bs*2] * u_23;
+		tmp2 -= pB[2+bs*2] * u_23;
+		tmp3 -= pB[3+bs*2] * u_23;
+		pB[0+bs*3] = tmp0;
+		pB[1+bs*3] = tmp1;
+		pB[2+bs*3] = tmp2;
+		pB[3+bs*3] = tmp3;
+		pB += bs*sda;
+		}
+	for( ; k<ma; k++)
+		{
+		tmp0  = pB[0+bs*3];
+		tmp0 -= pB[0+bs*0] * u_03;
+		tmp0 -= pB[0+bs*1] * u_13;
+		tmp0 -= pB[0+bs*2] * u_23;
+		pB[0+bs*3] = tmp0;
+		pB += 1;
+		}
+
+	sidamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+	ipiv[3] = idamax+3;
+	if(tmp3!=0)
+		{
+		if(ipiv[3]!=3)
+			srowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+		tmp3 = 1.0 / pA[3+bs*3];
+		inv_diag_A[3] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			pB[0+bs*3] *= tmp3;
+			pB[1+bs*3] *= tmp3;
+			pB[2+bs*3] *= tmp3;
+			pB[3+bs*3] *= tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			pB[0+bs*3] *= tmp3;
+			pB += 1;
+			}
+		}
+	else
+		{
+		inv_diag_A[3] = 0.0;
+		}
+	
+	return;
+
+	}
+#endif
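+// Illustration (a sketch, not part of the kernel above): this is one
+// right-looking LU step with partial pivoting on a 4-column panel. For each
+// column j it (1) applies the updates from columns 0..j-1, (2) locates the
+// pivot with sidamax_lib4, (3) swaps rows with srowsw_lib and records the
+// 0-based row index in ipiv[j], and (4) stores 1/U(j,j) in inv_diag_A[j] while
+// scaling the entries below the diagonal into the L multipliers, so that the
+// panel holds the corresponding slice of P*A = L*U.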
+
+
+
+// it processes m>0 rows and 0<n<=4 cols
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgetrf_pivot_4_vs_lib4(int m, int n, float *pA, int sda, float *inv_diag_A, int* ipiv)
+	{
+
+	if(m<=0 || n<=0)
+		return;
+
+	const int bs = 4;
+
+	// ma = number of rows below the 4x4 block (negative when m<4, in which
+	// case the panel clean-up loops below are skipped)
+	int ma = m-4;
+
+	float
+		tmp0, tmp1, tmp2, tmp3,
+		u_00, u_01, u_02, u_03,
+		      u_11, u_12, u_13,
+		            u_22, u_23,
+		                  u_33;
+	
+	float
+		*pB;
+	
+	int 
+		k, idamax;
+	
+	// first column
+
+	// find pivot & scale
+	sidamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
+	ipiv[0] = idamax;
+	if(tmp0!=0.0)
+		{
+		if(ipiv[0]!=0)
+			srowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+		tmp0 = 1.0 / pA[0+bs*0];
+		inv_diag_A[0] = tmp0;
+		if(m>=4)
+			{
+			pA[1+bs*0] *= tmp0;
+			pA[2+bs*0] *= tmp0;
+			pA[3+bs*0] *= tmp0;
+			pB = pA + bs*sda;
+			for(k=0; k<ma-3; k+=4)
+				{
+				pB[0+bs*0] *= tmp0;
+				pB[1+bs*0] *= tmp0;
+				pB[2+bs*0] *= tmp0;
+				pB[3+bs*0] *= tmp0;
+				pB += bs*sda;
+				}
+			for( ; k<ma; k++)
+				{
+				pB[0+bs*0] *= tmp0;
+				pB += 1;
+				}
+			}
+		else // m = {1,2,3}
+			{
+			if(m>1)
+				{
+				pA[1+bs*0] *= tmp0;
+				if(m>2)
+					pA[2+bs*0] *= tmp0;
+				}
+			}
+		}
+	else
+		{
+		inv_diag_A[0] = 0.0;
+		}
+	
+	if(n==1 || m==1) // XXX for the first row there is nothing to do, so we can return here
+		return;
+
+	// second column
+
+	// correct
+	if(m>=4)
+		{
+		u_01  = pA[0+bs*1];
+		tmp1  = pA[1+bs*1];
+		tmp2  = pA[2+bs*1];
+		tmp3  = pA[3+bs*1];
+		tmp1 -= pA[1+bs*0] * u_01;
+		tmp2 -= pA[2+bs*0] * u_01;
+		tmp3 -= pA[3+bs*0] * u_01;
+		pA[1+bs*1] = tmp1;
+		pA[2+bs*1] = tmp2;
+		pA[3+bs*1] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			tmp0  = pB[0+bs*1];
+			tmp1  = pB[1+bs*1];
+			tmp2  = pB[2+bs*1];
+			tmp3  = pB[3+bs*1];
+			tmp0 -= pB[0+bs*0] * u_01;
+			tmp1 -= pB[1+bs*0] * u_01;
+			tmp2 -= pB[2+bs*0] * u_01;
+			tmp3 -= pB[3+bs*0] * u_01;
+			pB[0+bs*1] = tmp0;
+			pB[1+bs*1] = tmp1;
+			pB[2+bs*1] = tmp2;
+			pB[3+bs*1] = tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			tmp0 = pB[0+bs*1];
+			tmp0 -= pB[0+bs*0] * u_01;
+			pB[0+bs*1] = tmp0;
+			pB += 1;
+			}
+		}
+	else // m = {2,3}
+		{
+		u_01  = pA[0+bs*1];
+		tmp1  = pA[1+bs*1];
+		tmp1 -= pA[1+bs*0] * u_01;
+		pA[1+bs*1] = tmp1;
+		if(m>2)
+			{
+			tmp2  = pA[2+bs*1];
+			tmp2 -= pA[2+bs*0] * u_01;
+			pA[2+bs*1] = tmp2;
+			}
+		}
+
+	// find pivot & scale
+	sidamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
+	ipiv[1] = idamax+1;
+	if(tmp1!=0)
+		{
+		if(ipiv[1]!=1)
+			srowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+		tmp1 = 1.0 / pA[1+bs*1];
+		inv_diag_A[1] = tmp1;
+		if(m>=4)
+			{
+			pA[2+bs*1] *= tmp1;
+			pA[3+bs*1] *= tmp1;
+			pB = pA + bs*sda;
+			for(k=0; k<ma-3; k+=4)
+				{
+				pB[0+bs*1] *= tmp1;
+				pB[1+bs*1] *= tmp1;
+				pB[2+bs*1] *= tmp1;
+				pB[3+bs*1] *= tmp1;
+				pB += bs*sda;
+				}
+			for( ; k<ma; k++)
+				{
+				pB[0+bs*1] *= tmp1;
+				pB += 1;
+				}
+			}
+		else // m = {2,3}
+			{
+			if(m>2)
+				pA[2+bs*1] *= tmp1;
+			}
+		}
+	else
+		{
+		inv_diag_A[1] = 0.0;
+		}
+
+	if(n==2)
+		return;
+
+	// third column
+
+	// correct
+	if(m>=4)
+		{
+		u_02  = pA[0+bs*2];
+		u_12  = pA[1+bs*2];
+		u_12 -= pA[1+bs*0] * u_02;
+		pA[1+bs*2] = u_12;
+		tmp2  = pA[2+bs*2];
+		tmp3  = pA[3+bs*2];
+		tmp2 -= pA[2+bs*0] * u_02;
+		tmp3 -= pA[3+bs*0] * u_02;
+		tmp2 -= pA[2+bs*1] * u_12;
+		tmp3 -= pA[3+bs*1] * u_12;
+		pA[2+bs*2] = tmp2;
+		pA[3+bs*2] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			tmp0  = pB[0+bs*2];
+			tmp1  = pB[1+bs*2];
+			tmp2  = pB[2+bs*2];
+			tmp3  = pB[3+bs*2];
+			tmp0 -= pB[0+bs*0] * u_02;
+			tmp1 -= pB[1+bs*0] * u_02;
+			tmp2 -= pB[2+bs*0] * u_02;
+			tmp3 -= pB[3+bs*0] * u_02;
+			tmp0 -= pB[0+bs*1] * u_12;
+			tmp1 -= pB[1+bs*1] * u_12;
+			tmp2 -= pB[2+bs*1] * u_12;
+			tmp3 -= pB[3+bs*1] * u_12;
+			pB[0+bs*2] = tmp0;
+			pB[1+bs*2] = tmp1;
+			pB[2+bs*2] = tmp2;
+			pB[3+bs*2] = tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			tmp0  = pB[0+bs*2];
+			tmp0 -= pB[0+bs*0] * u_02;
+			tmp0 -= pB[0+bs*1] * u_12;
+			pB[0+bs*2] = tmp0;
+			pB += 1;
+			}
+		}
+	else // m = {2,3}
+		{
+		u_02  = pA[0+bs*2];
+		u_12  = pA[1+bs*2];
+		u_12 -= pA[1+bs*0] * u_02;
+		pA[1+bs*2] = u_12;
+		if(m>2)
+			{
+			tmp2  = pA[2+bs*2];
+			tmp2 -= pA[2+bs*0] * u_02;
+			tmp2 -= pA[2+bs*1] * u_12;
+			pA[2+bs*2] = tmp2;
+			}
+		}
+
+	// find pivot & scale
+	if(m>2)
+		{
+		sidamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+		ipiv[2] = idamax+2;
+		if(tmp2!=0)
+			{
+			if(ipiv[2]!=2)
+				srowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+			tmp2 = 1.0 / pA[2+bs*2];
+			inv_diag_A[2] = tmp2;
+			if(m>=4)
+				{
+				pA[3+bs*2] *= tmp2;
+				pB = pA + bs*sda;
+				for(k=0; k<ma-3; k+=4)
+					{
+					pB[0+bs*2] *= tmp2;
+					pB[1+bs*2] *= tmp2;
+					pB[2+bs*2] *= tmp2;
+					pB[3+bs*2] *= tmp2;
+					pB += bs*sda;
+					}
+				for( ; k<ma; k++)
+					{
+					pB[0+bs*2] *= tmp2;
+					pB += 1;
+					}
+				}
+			}
+		else
+			{
+			inv_diag_A[2] = 0.0;
+			}
+		}
+
+	if(n<4)
+		return;
+
+	// fourth column
+
+	// correct
+	if(m>=4)
+		{
+		u_03  = pA[0+bs*3];
+		u_13  = pA[1+bs*3];
+		u_13 -= pA[1+bs*0] * u_03;
+		pA[1+bs*3] = u_13;
+		u_23  = pA[2+bs*3];
+		u_23 -= pA[2+bs*0] * u_03;
+		u_23 -= pA[2+bs*1] * u_13;
+		pA[2+bs*3] = u_23;
+		tmp3  = pA[3+bs*3];
+		tmp3 -= pA[3+bs*0] * u_03;
+		tmp3 -= pA[3+bs*1] * u_13;
+		tmp3 -= pA[3+bs*2] * u_23;
+		pA[3+bs*3] = tmp3;
+		pB = pA + bs*sda;
+		for(k=0; k<ma-3; k+=4)
+			{
+			tmp0  = pB[0+bs*3];
+			tmp1  = pB[1+bs*3];
+			tmp2  = pB[2+bs*3];
+			tmp3  = pB[3+bs*3];
+			tmp0 -= pB[0+bs*0] * u_03;
+			tmp1 -= pB[1+bs*0] * u_03;
+			tmp2 -= pB[2+bs*0] * u_03;
+			tmp3 -= pB[3+bs*0] * u_03;
+			tmp0 -= pB[0+bs*1] * u_13;
+			tmp1 -= pB[1+bs*1] * u_13;
+			tmp2 -= pB[2+bs*1] * u_13;
+			tmp3 -= pB[3+bs*1] * u_13;
+			tmp0 -= pB[0+bs*2] * u_23;
+			tmp1 -= pB[1+bs*2] * u_23;
+			tmp2 -= pB[2+bs*2] * u_23;
+			tmp3 -= pB[3+bs*2] * u_23;
+			pB[0+bs*3] = tmp0;
+			pB[1+bs*3] = tmp1;
+			pB[2+bs*3] = tmp2;
+			pB[3+bs*3] = tmp3;
+			pB += bs*sda;
+			}
+		for( ; k<ma; k++)
+			{
+			tmp0  = pB[0+bs*3];
+			tmp0 -= pB[0+bs*0] * u_03;
+			tmp0 -= pB[0+bs*1] * u_13;
+			tmp0 -= pB[0+bs*2] * u_23;
+			pB[0+bs*3] = tmp0;
+			pB += 1;
+			}
+		}
+	else // m = {2,3}
+		{
+		u_03  = pA[0+bs*3];
+		u_13  = pA[1+bs*3];
+		u_13 -= pA[1+bs*0] * u_03;
+		pA[1+bs*3] = u_13;
+		if(m>2)
+			{
+			u_23  = pA[2+bs*3];
+			u_23 -= pA[2+bs*0] * u_03;
+			u_23 -= pA[2+bs*1] * u_13;
+			pA[2+bs*3] = u_23;
+			}
+		}
+
+	if(m>3)
+		{
+		// find pivot & scale
+		sidamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+		ipiv[3] = idamax+3;
+		if(tmp3!=0)
+			{
+			if(ipiv[3]!=3)
+				srowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+			tmp3 = 1.0 / pA[3+bs*3];
+			inv_diag_A[3] = tmp3;
+			pB = pA + bs*sda;
+			for(k=0; k<ma-3; k+=4)
+				{
+				pB[0+bs*3] *= tmp3;
+				pB[1+bs*3] *= tmp3;
+				pB[2+bs*3] *= tmp3;
+				pB[3+bs*3] *= tmp3;
+				pB += bs*sda;
+				}
+			for( ; k<ma; k++)
+				{
+				pB[0+bs*3] *= tmp3;
+				pB += 1;
+				}
+			}
+		else
+			{
+			inv_diag_A[3] = 0.0;
+			}
+		}
+	
+	return;
+
+	}
+#endif
+
+
+	
+
+
+
diff --git a/kernel/c99/kernel_ssymv_4_lib4.c b/kernel/c99/kernel_ssymv_4_lib4.c
new file mode 100644
index 0000000..5512154
--- /dev/null
+++ b/kernel/c99/kernel_ssymv_4_lib4.c
@@ -0,0 +1,1025 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_sgemv_nt_4_vs_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km)
+	{
+
+	if(kmax<=0) 
+		return;
+	
+	const int bs = 4;
+
+	int k;
+
+	float
+		a_00, a_01, a_02, a_03,
+		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+	
+	x_n_0 = 0;
+	x_n_1 = 0;
+	x_n_2 = 0;
+	x_n_3 = 0;
+
+	x_n_0 = alpha_n[0]*x_n[0];
+	if(km>1)
+		{
+		x_n_1 = alpha_n[0]*x_n[1];
+		if(km>2)
+			{
+			x_n_2 = alpha_n[0]*x_n[2];
+			if(km>3)
+				{
+				x_n_3 = alpha_n[0]*x_n[3];
+				}
+			}
+		}
+
+	y_t_0 = 0;
+	y_t_1 = 0;
+	y_t_2 = 0;
+	y_t_3 = 0;
+
+	k = 0;
+	for(; k<kmax-3; k+=bs)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+
+		// 1
+
+		y_n_0 = z_n[1]; 
+		x_t_0 = x_t[1];
+
+		a_00 = A[1+bs*0];
+		a_01 = A[1+bs*1];
+		a_02 = A[1+bs*2];
+		a_03 = A[1+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[1] = y_n_0;
+
+
+		// 2
+
+		y_n_0 = z_n[2]; 
+		x_t_0 = x_t[2];
+
+		a_00 = A[2+bs*0];
+		a_01 = A[2+bs*1];
+		a_02 = A[2+bs*2];
+		a_03 = A[2+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[2] = y_n_0;
+
+
+		// 3
+
+		y_n_0 = z_n[3]; 
+		x_t_0 = x_t[3];
+
+		a_00 = A[3+bs*0];
+		a_01 = A[3+bs*1];
+		a_02 = A[3+bs*2];
+		a_03 = A[3+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[3] = y_n_0;
+
+
+		A += sda*bs;
+		z_n += 4;
+		x_t += 4;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		}
+	
+	// store t
+	z_t[0] = alpha_t[0]*y_t_0 + beta_t[0]*y_t[0];
+	if(km>1)
+		{
+		z_t[1] = alpha_t[0]*y_t_1 + beta_t[0]*y_t[1];
+		if(km>2)
+			{
+			z_t[2] = alpha_t[0]*y_t_2 + beta_t[0]*y_t[2];
+			if(km>3)
+				{
+				z_t[3] = alpha_t[0]*y_t_3 + beta_t[0]*y_t[3];
+				}
+			}
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_sgemv_nt_4_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t)
+	{
+
+	kernel_sgemv_nt_4_vs_lib4(kmax, alpha_n, alpha_t, A, sda, x_n, x_t, beta_t, y_t, z_n, z_t, 4);
+
+	return;
+
+	}
+#endif
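+
+
+
+// Usage sketch (illustrative only; beta_n and y_n below are hypothetical
+// caller-side buffers, not kernel arguments). As the XXX note above says, the
+// kernel only accumulates the 'n' result into z_n, so the caller is expected
+// to copy and scale y_n into z_n beforehand:
+//
+//	for(ii=0; ii<kmax; ii++)
+//		z_n[ii] = beta_n[0]*y_n[ii];
+//	kernel_sgemv_nt_4_lib4(kmax, alpha_n, alpha_t, A, sda, x_n, x_t, beta_t, y_t, z_n, z_t);
+//	// now z_n += alpha_n * A * x_n[0..3]  and  z_t = beta_t*y_t + alpha_t * A' * x_t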
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_ssymv_l_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x_n, float *z_n, int km)
+	{
+
+	if(kmax<=0) 
+		return;
+	
+	float *x_t = x_n;
+	float *z_t = z_n;
+
+	const int bs = 4;
+
+	int k;
+
+	float
+		a_00, a_01, a_02, a_03,
+		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+	
+	x_n_0 = 0;
+	x_n_1 = 0;
+	x_n_2 = 0;
+	x_n_3 = 0;
+
+	x_n_0 = alpha[0]*x_n[0];
+	if(km>1)
+		{
+		x_n_1 = alpha[0]*x_n[1];
+		if(km>2)
+			{
+			x_n_2 = alpha[0]*x_n[2];
+			if(km>3)
+				{
+				x_n_3 = alpha[0]*x_n[3];
+				}
+			}
+		}
+
+	y_t_0 = 0;
+	y_t_1 = 0;
+	y_t_2 = 0;
+	y_t_3 = 0;
+
+	k = 0;
+	if(offA==0)
+		{
+		if(kmax<4)
+			{
+			// 0
+
+			x_t_0 = x_t[0];
+
+			a_00 = A[0+bs*0];
+			
+			y_t_0 += a_00 * x_t_0;
+
+			if(kmax==1)
+				goto store_t;
+
+			// 1
+
+			y_n_0 = z_n[1]; 
+			x_t_0 = x_t[1];
+
+			a_00 = A[1+bs*0];
+			a_01 = A[1+bs*1];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_t_1 += a_01 * x_t_0;
+
+			z_n[1] = y_n_0;
+
+			if(kmax==2)
+				goto store_t;
+
+			// 2
+
+			y_n_0 = z_n[2]; 
+			x_t_0 = x_t[2];
+
+			a_00 = A[2+bs*0];
+			a_01 = A[2+bs*1];
+			a_02 = A[2+bs*2];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_n_0 += a_01 * x_n_1;
+			y_t_1 += a_01 * x_t_0;
+			y_t_2 += a_02 * x_t_0;
+
+			z_n[2] = y_n_0;
+
+			goto store_t;
+			}
+		else
+			{
+
+			// 0
+
+			x_t_0 = x_t[0];
+
+			a_00 = A[0+bs*0];
+			
+			y_t_0 += a_00 * x_t_0;
+
+
+			// 1
+
+			y_n_0 = z_n[1]; 
+			x_t_0 = x_t[1];
+
+			a_00 = A[1+bs*0];
+			a_01 = A[1+bs*1];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_t_1 += a_01 * x_t_0;
+
+			z_n[1] = y_n_0;
+
+
+			// 2
+
+			y_n_0 = z_n[2]; 
+			x_t_0 = x_t[2];
+
+			a_00 = A[2+bs*0];
+			a_01 = A[2+bs*1];
+			a_02 = A[2+bs*2];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_n_0 += a_01 * x_n_1;
+			y_t_1 += a_01 * x_t_0;
+			y_t_2 += a_02 * x_t_0;
+
+			z_n[2] = y_n_0;
+
+
+			// 3
+
+			y_n_0 = z_n[3]; 
+			x_t_0 = x_t[3];
+
+			a_00 = A[3+bs*0];
+			a_01 = A[3+bs*1];
+			a_02 = A[3+bs*2];
+			a_03 = A[3+bs*3];
+			
+			y_n_0 += a_00 * x_n_0;
+			y_t_0 += a_00 * x_t_0;
+			y_n_0 += a_01 * x_n_1;
+			y_t_1 += a_01 * x_t_0;
+			y_n_0 += a_02 * x_n_2;
+			y_t_2 += a_02 * x_t_0;
+			y_t_3 += a_03 * x_t_0;
+
+			z_n[3] = y_n_0;
+
+			k += 4;
+			A += sda*bs;
+			z_n += 4;
+			x_t += 4;
+
+			}
+		}
+	else if(offA==1)
+		{
+
+		// 0
+
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		
+		y_t_0 += a_00 * x_t_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==1)
+			goto store_t;
+
+		// 1
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_t_1 += a_01 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==2)
+			goto store_t;
+
+		// 2
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_t_2 += a_02 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==3)
+			goto store_t;
+
+		// 3
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==4)
+			goto store_t;
+
+		// 4
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==5)
+			goto store_t;
+
+		// 5
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==6)
+			goto store_t;
+
+		// 6
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==7)
+			goto store_t;
+
+		k += 7;
+
+		}
+	else if(offA==2)
+		{
+
+		// 0
+
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		
+		y_t_0 += a_00 * x_t_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==1)
+			goto store_t;
+
+		// 1
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_t_1 += a_01 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==2)
+			goto store_t;
+
+		// 2
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_t_2 += a_02 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==3)
+			goto store_t;
+
+		// 3
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==4)
+			goto store_t;
+
+		// 4
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==5)
+			goto store_t;
+
+		// 5
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==6)
+			goto store_t;
+
+		k += 6;
+
+		}
+	else // if(offA==3)
+		{
+
+		// 0
+
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		
+		y_t_0 += a_00 * x_t_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==1)
+			goto store_t;
+
+		// 1
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_t_1 += a_01 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==2)
+			goto store_t;
+
+		// 2
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_t_2 += a_02 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==3)
+			goto store_t;
+
+		// 3
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		if(kmax==4)
+			goto store_t;
+
+		// 4
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		A += (sda-1)*bs; // new panel
+
+		if(kmax==5)
+			goto store_t;
+
+		k += 5;
+
+		}
+	for(; k<kmax-3; k+=bs)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+
+		// 1
+
+		y_n_0 = z_n[1]; 
+		x_t_0 = x_t[1];
+
+		a_00 = A[1+bs*0];
+		a_01 = A[1+bs*1];
+		a_02 = A[1+bs*2];
+		a_03 = A[1+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[1] = y_n_0;
+
+
+		// 2
+
+		y_n_0 = z_n[2]; 
+		x_t_0 = x_t[2];
+
+		a_00 = A[2+bs*0];
+		a_01 = A[2+bs*1];
+		a_02 = A[2+bs*2];
+		a_03 = A[2+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[2] = y_n_0;
+
+
+		// 3
+
+		y_n_0 = z_n[3]; 
+		x_t_0 = x_t[3];
+
+		a_00 = A[3+bs*0];
+		a_01 = A[3+bs*1];
+		a_02 = A[3+bs*2];
+		a_03 = A[3+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[3] = y_n_0;
+
+
+		A += sda*bs;
+		z_n += 4;
+		x_t += 4;
+
+		}
+	for(; k<kmax; k++)
+		{
+
+		// 0
+
+		y_n_0 = z_n[0]; 
+		x_t_0 = x_t[0];
+
+		a_00 = A[0+bs*0];
+		a_01 = A[0+bs*1];
+		a_02 = A[0+bs*2];
+		a_03 = A[0+bs*3];
+		
+		y_n_0 += a_00 * x_n_0;
+		y_t_0 += a_00 * x_t_0;
+		y_n_0 += a_01 * x_n_1;
+		y_t_1 += a_01 * x_t_0;
+		y_n_0 += a_02 * x_n_2;
+		y_t_2 += a_02 * x_t_0;
+		y_n_0 += a_03 * x_n_3;
+		y_t_3 += a_03 * x_t_0;
+
+		z_n[0] = y_n_0;
+
+		A += 1;
+		z_n += 1;
+		x_t += 1;
+
+		}
+	
+	store_t:
+	z_t[0] += alpha[0]*y_t_0;
+	if(km>1)
+		{
+		z_t[1] += alpha[0]*y_t_1;
+		if(km>2)
+			{
+			z_t[2] += alpha[0]*y_t_2;
+			if(km>3)
+				{
+				z_t[3] += alpha[0]*y_t_3;
+				}
+			}
+		}
+
+	return;
+
+	}
+#endif
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_ssymv_l_4_lib4(int kmax, float *alpha, float *A, int sda, float *x_n, float *z_n)
+	{
+
+	kernel_ssymv_l_4_gen_lib4(kmax, alpha, 0, A, sda, x_n, z_n, 4);
+
+	return;
+
+	}
+#endif
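+
+
+
+// Reference (scalar) model of what kernel_ssymv_l_4_lib4 computes for one
+// 4-wide block column of a symmetric matrix stored in its lower triangle
+// (lib4 panel-major layout assumed: element (ii,jj) of the block column sits
+// at A[(ii/4)*4*sda + ii%4 + 4*jj]); as for the gemv_nt kernels above, z_n
+// must already contain the beta*y part:
+//
+//	for(jj=0; jj<4; jj++)
+//		{
+//		z_n[jj] += alpha[0]*A[jj+4*jj]*x_n[jj];              // diagonal entry, used once
+//		for(ii=jj+1; ii<kmax; ii++)
+//			{
+//			a_ij = A[(ii/4)*4*sda + ii%4 + 4*jj];            // A(ii,jj), strictly lower
+//			z_n[ii] += alpha[0]*a_ij*x_n[jj];                // 'n' side
+//			z_n[jj] += alpha[0]*a_ij*x_n[ii];                // mirrored 't' side
+//			}
+//		}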
+
+
+
+
+
diff --git a/kernel/fma/Makefile b/kernel/fma/Makefile
new file mode 100644
index 0000000..d7be280
--- /dev/null
+++ b/kernel/fma/Makefile
@@ -0,0 +1,49 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS = 
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += kernel_dgemm_4x4_lib4.o
+OBJS +=
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+	rm -f *.o
+	rm -f *.s
+
diff --git a/kernel/fma/kernel_dgemm_4x4_lib4.S b/kernel/fma/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..a02f37d
--- /dev/null
+++ b/kernel/fma/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,3895 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp);
+#define EPILOGUE \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp);
+#define EPILOGUE \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+4*k*sizeof(double)
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmovapd 		0(%r11), %xmm8 // A[0]
+	vmovapd 		16(%r11), %xmm9 // A[2]
+
+	vmovddup		0(%r12), %xmm12 // B[0]
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+
+	vmovddup		8(%r12), %xmm12 // B[1]
+	vfmadd231pd		%xmm8, %xmm12, %xmm2
+	vfmadd231pd		%xmm9, %xmm12, %xmm3
+
+	vmovddup		16(%r12), %xmm12 // B[2]
+	vfmadd231pd		%xmm8, %xmm12, %xmm4
+	vfmadd231pd		%xmm9, %xmm12, %xmm5
+
+	vmovddup		24(%r12), %xmm12 // B[3]
+	vfmadd231pd		%xmm8, %xmm12, %xmm6
+	vfmadd231pd		%xmm9, %xmm12, %xmm7
+
+
+	subl	$4, %r10d
+
+
+	// unroll 1
+	vmovapd 		32(%r11), %xmm8 // A[4]
+	vmovapd 		48(%r11), %xmm9 // A[6]
+
+	vmovddup		32(%r12), %xmm12 // B[4]
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+
+	vmovddup		40(%r12), %xmm12 // B[5]
+	vfmadd231pd		%xmm8, %xmm12, %xmm2
+	vfmadd231pd		%xmm9, %xmm12, %xmm3
+
+	vmovddup		48(%r12), %xmm12 // B[6]
+	vfmadd231pd		%xmm8, %xmm12, %xmm4
+	vfmadd231pd		%xmm9, %xmm12, %xmm5
+
+	vmovddup		56(%r12), %xmm12 // B[7]
+	vfmadd231pd		%xmm8, %xmm12, %xmm6
+	vfmadd231pd		%xmm9, %xmm12, %xmm7
+
+
+	// unroll 2
+	vmovapd 		64(%r11), %xmm8 // A[8]
+	vmovapd 		80(%r11), %xmm9 // A[10]
+
+	vmovddup		64(%r12), %xmm12 // B[8]
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+
+	vmovddup		72(%r12), %xmm12 // B[9]
+	vfmadd231pd		%xmm8, %xmm12, %xmm2
+	vfmadd231pd		%xmm9, %xmm12, %xmm3
+
+	vmovddup		80(%r12), %xmm12 // B[10]
+	vfmadd231pd		%xmm8, %xmm12, %xmm4
+	vfmadd231pd		%xmm9, %xmm12, %xmm5
+
+	vmovddup		88(%r12), %xmm12 // B[11]
+	vfmadd231pd		%xmm8, %xmm12, %xmm6
+	vfmadd231pd		%xmm9, %xmm12, %xmm7
+
+
+	// unroll 3
+	vmovapd 		96(%r11), %xmm8 // A[12]
+	vmovapd 		112(%r11), %xmm9 // A[14]
+
+	vmovddup		96(%r12), %xmm12 // B[12]
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+
+	vmovddup		104(%r12), %xmm12 // B[13]
+	vfmadd231pd		%xmm8, %xmm12, %xmm2
+	vfmadd231pd		%xmm9, %xmm12, %xmm3
+
+	vmovddup		112(%r12), %xmm12 // B[14]
+	vfmadd231pd		%xmm8, %xmm12, %xmm4
+	vfmadd231pd		%xmm9, %xmm12, %xmm5
+
+	vmovddup		120(%r12), %xmm12 // B[15]
+	vfmadd231pd		%xmm8, %xmm12, %xmm6
+	vfmadd231pd		%xmm9, %xmm12, %xmm7
+
+
+	addq	$128, %r11
+	addq	$128, %r12
+
+
+	cmpl	$4, %r10d
+
+
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vmovapd 		0(%r11), %xmm8 // A[0]
+	vmovapd 		16(%r11), %xmm9 // A[2]
+
+	vmovddup		0(%r12), %xmm12 // B[0]
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+
+	vmovddup		8(%r12), %xmm12 // B[1]
+	vfmadd231pd		%xmm8, %xmm12, %xmm2
+	vfmadd231pd		%xmm9, %xmm12, %xmm3
+
+	vmovddup		16(%r12), %xmm12 // B[2]
+	vfmadd231pd		%xmm8, %xmm12, %xmm4
+	vfmadd231pd		%xmm9, %xmm12, %xmm5
+
+	vmovddup		24(%r12), %xmm12 // B[3]
+	vfmadd231pd		%xmm8, %xmm12, %xmm6
+	vfmadd231pd		%xmm9, %xmm12, %xmm7
+
+
+	// unroll 1
+	vmovapd 		32(%r11), %xmm8 // A[4]
+	vmovapd 		48(%r11), %xmm9 // A[6]
+
+	vmovddup		32(%r12), %xmm12 // B[4]
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+
+	vmovddup		40(%r12), %xmm12 // B[5]
+	vfmadd231pd		%xmm8, %xmm12, %xmm2
+	vfmadd231pd		%xmm9, %xmm12, %xmm3
+
+	vmovddup		48(%r12), %xmm12 // B[6]
+	vfmadd231pd		%xmm8, %xmm12, %xmm4
+	vfmadd231pd		%xmm9, %xmm12, %xmm5
+
+	vmovddup		56(%r12), %xmm12 // B[7]
+	vfmadd231pd		%xmm8, %xmm12, %xmm6
+	vfmadd231pd		%xmm9, %xmm12, %xmm7
+
+
+	// unroll 2
+	vmovapd 		64(%r11), %xmm8 // A[8]
+	vmovapd 		80(%r11), %xmm9 // A[10]
+
+	vmovddup		64(%r12), %xmm12 // B[8]
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+
+	vmovddup		72(%r12), %xmm12 // B[9]
+	vfmadd231pd		%xmm8, %xmm12, %xmm2
+	vfmadd231pd		%xmm9, %xmm12, %xmm3
+
+	vmovddup		80(%r12), %xmm12 // B[10]
+	vfmadd231pd		%xmm8, %xmm12, %xmm4
+	vfmadd231pd		%xmm9, %xmm12, %xmm5
+
+	vmovddup		88(%r12), %xmm12 // B[11]
+	vfmadd231pd		%xmm8, %xmm12, %xmm6
+	vfmadd231pd		%xmm9, %xmm12, %xmm7
+
+
+	// unroll 3
+	vmovapd 		96(%r11), %xmm8 // A[12]
+	vmovapd 		112(%r11), %xmm9 // A[14]
+
+	vmovddup		96(%r12), %xmm12 // B[12]
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+
+	vmovddup		104(%r12), %xmm12 // B[13]
+	vfmadd231pd		%xmm8, %xmm12, %xmm2
+	vfmadd231pd		%xmm9, %xmm12, %xmm3
+
+	vmovddup		112(%r12), %xmm12 // B[14]
+	vfmadd231pd		%xmm8, %xmm12, %xmm4
+	vfmadd231pd		%xmm9, %xmm12, %xmm5
+
+	vmovddup		120(%r12), %xmm12 // B[15]
+	vfmadd231pd		%xmm8, %xmm12, %xmm6
+	vfmadd231pd		%xmm9, %xmm12, %xmm7
+
+
+	addq	$128, %r12
+	addq	$128, %r11
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd 		0(%r11), %xmm8 // A[0]
+	vmovapd 		16(%r11), %xmm9 // A[2]
+
+	vmovddup		0(%r12), %xmm12 // B[0]
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+
+	subl	$1, %r10d
+
+	vmovddup		8(%r12), %xmm12 // B[1]
+	vfmadd231pd		%xmm8, %xmm12, %xmm2
+	vfmadd231pd		%xmm9, %xmm12, %xmm3
+
+	vmovddup		16(%r12), %xmm12 // B[2]
+	vfmadd231pd		%xmm8, %xmm12, %xmm4
+	vfmadd231pd		%xmm9, %xmm12, %xmm5
+
+	vmovddup		24(%r12), %xmm12 // B[3]
+	vfmadd231pd		%xmm8, %xmm12, %xmm6
+	vfmadd231pd	%xmm9, %xmm12, %xmm7
+
+	addq	$32, %r11
+	addq	$32, %r12
+
+	cmpl	$0, %r10d
+
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
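+
+
+
+// Reference (scalar) model of the accumulation performed by the routine above,
+// assuming 4-wide panel-major (lib4) storage of A and B; acc[ii+4*jj] maps onto
+// the xmm register pairs documented in the comments (column jj of the 4x4 tile
+// lives in xmm(2*jj) for rows 0-1 and xmm(2*jj+1) for rows 2-3):
+//
+//	for(kk=0; kk<k; kk++)
+//		for(jj=0; jj<4; jj++)
+//			for(ii=0; ii<4; ii++)
+//				acc[ii+4*jj] += A[ii+4*kk]*B[jj+4*kk];   // i.e. acc += A * B^T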
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+4*k*sizeof(double)
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nt_4x4_lib4, @function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmovapd 		0(%r11), %xmm8 // A[0]
+	vmovapd 		16(%r11), %xmm9 // A[2]
+
+	vmovddup		0(%r12), %xmm12 // B[0]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm0
+	vfnmadd231pd	%xmm9, %xmm12, %xmm1
+
+	vmovddup		8(%r12), %xmm12 // B[1]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm2
+	vfnmadd231pd	%xmm9, %xmm12, %xmm3
+
+	vmovddup		16(%r12), %xmm12 // B[2]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm4
+	vfnmadd231pd	%xmm9, %xmm12, %xmm5
+
+	vmovddup		24(%r12), %xmm12 // B[3]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm6
+	vfnmadd231pd	%xmm9, %xmm12, %xmm7
+
+
+	subl	$4, %r10d
+
+
+	// unroll 1
+	vmovapd 		32(%r11), %xmm8 // A[4]
+	vmovapd 		48(%r11), %xmm9 // A[6]
+
+	vmovddup		32(%r12), %xmm12 // B[4]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm0
+	vfnmadd231pd	%xmm9, %xmm12, %xmm1
+
+	vmovddup		40(%r12), %xmm12 // B[5]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm2
+	vfnmadd231pd	%xmm9, %xmm12, %xmm3
+
+	vmovddup		48(%r12), %xmm12 // B[6]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm4
+	vfnmadd231pd	%xmm9, %xmm12, %xmm5
+
+	vmovddup		56(%r12), %xmm12 // B[7]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm6
+	vfnmadd231pd	%xmm9, %xmm12, %xmm7
+
+
+	// unroll 2
+	vmovapd 		64(%r11), %xmm8 // A[8]
+	vmovapd 		80(%r11), %xmm9 // A[10]
+
+	vmovddup		64(%r12), %xmm12 // B[8]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm0
+	vfnmadd231pd	%xmm9, %xmm12, %xmm1
+
+	vmovddup		72(%r12), %xmm12 // B[9]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm2
+	vfnmadd231pd	%xmm9, %xmm12, %xmm3
+
+	vmovddup		80(%r12), %xmm12 // B[10]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm4
+	vfnmadd231pd	%xmm9, %xmm12, %xmm5
+
+	vmovddup		88(%r12), %xmm12 // B[11]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm6
+	vfnmadd231pd	%xmm9, %xmm12, %xmm7
+
+
+	// unroll 3
+	vmovapd 		96(%r11), %xmm8 // A[12]
+	vmovapd 		112(%r11), %xmm9 // A[14]
+
+	vmovddup		96(%r12), %xmm12 // B[12]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm0
+	vfnmadd231pd	%xmm9, %xmm12, %xmm1
+
+	vmovddup		104(%r12), %xmm12 // B[13]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm2
+	vfnmadd231pd	%xmm9, %xmm12, %xmm3
+
+	vmovddup		112(%r12), %xmm12 // B[14]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm4
+	vfnmadd231pd	%xmm9, %xmm12, %xmm5
+
+	vmovddup		120(%r12), %xmm12 // B[15]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm6
+	vfnmadd231pd	%xmm9, %xmm12, %xmm7
+
+
+	addq	$128, %r12
+	addq	$128, %r11
+
+
+	cmpl	$4, %r10d
+
+
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	vmovapd 		0(%r11), %xmm8 // A[0]
+	vmovapd 		16(%r11), %xmm9 // A[2]
+
+	vmovddup		0(%r12), %xmm12 // B[0]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm0
+	vfnmadd231pd	%xmm9, %xmm12, %xmm1
+
+	vmovddup		8(%r12), %xmm12 // B[1]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm2
+	vfnmadd231pd	%xmm9, %xmm12, %xmm3
+
+	vmovddup		16(%r12), %xmm12 // B[2]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm4
+	vfnmadd231pd	%xmm9, %xmm12, %xmm5
+
+	vmovddup		24(%r12), %xmm12 // B[3]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm6
+	vfnmadd231pd	%xmm9, %xmm12, %xmm7
+
+
+	// unroll 1
+	vmovapd 		32(%r11), %xmm8 // A[4]
+	vmovapd 		48(%r11), %xmm9 // A[6]
+
+	vmovddup		32(%r12), %xmm12 // B[4]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm0
+	vfnmadd231pd	%xmm9, %xmm12, %xmm1
+
+	vmovddup		40(%r12), %xmm12 // B[5]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm2
+	vfnmadd231pd	%xmm9, %xmm12, %xmm3
+
+	vmovddup		48(%r12), %xmm12 // B[6]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm4
+	vfnmadd231pd	%xmm9, %xmm12, %xmm5
+
+	vmovddup		56(%r12), %xmm12 // B[7]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm6
+	vfnmadd231pd	%xmm9, %xmm12, %xmm7
+
+
+	// unroll 2
+	vmovapd 		64(%r11), %xmm8 // A[8]
+	vmovapd 		80(%r11), %xmm9 // A[10]
+
+	vmovddup		64(%r12), %xmm12 // B[8]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm0
+	vfnmadd231pd	%xmm9, %xmm12, %xmm1
+
+	vmovddup		72(%r12), %xmm12 // B[9]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm2
+	vfnmadd231pd	%xmm9, %xmm12, %xmm3
+
+	vmovddup		80(%r12), %xmm12 // B[10]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm4
+	vfnmadd231pd	%xmm9, %xmm12, %xmm5
+
+	vmovddup		88(%r12), %xmm12 // B[11]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm6
+	vfnmadd231pd	%xmm9, %xmm12, %xmm7
+
+
+	// unroll 3
+	vmovapd 		96(%r11), %xmm8 // A[12]
+	vmovapd 		112(%r11), %xmm9 // A[14]
+
+	vmovddup		96(%r12), %xmm12 // B[12]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm0
+	vfnmadd231pd	%xmm9, %xmm12, %xmm1
+
+	vmovddup		104(%r12), %xmm12 // B[13]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm2
+	vfnmadd231pd	%xmm9, %xmm12, %xmm3
+
+	vmovddup		112(%r12), %xmm12 // B[14]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm4
+	vfnmadd231pd	%xmm9, %xmm12, %xmm5
+
+	vmovddup		120(%r12), %xmm12 // B[15]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm6
+	vfnmadd231pd	%xmm9, %xmm12, %xmm7
+
+
+	addq	$128, %r12
+	addq	$128, %r11
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	// unroll 0
+	vmovapd 		0(%r11), %xmm8 // A[0]
+	vmovapd 		16(%r11), %xmm9 // A[2]
+
+	vmovddup		0(%r12), %xmm12 // B[0]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm0
+	vfnmadd231pd	%xmm9, %xmm12, %xmm1
+
+	subl	$1, %r10d
+
+	vmovddup		8(%r12), %xmm12 // B[1]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm2
+	vfnmadd231pd	%xmm9, %xmm12, %xmm3
+
+	vmovddup		16(%r12), %xmm12 // B[2]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm4
+	vfnmadd231pd	%xmm9, %xmm12, %xmm5
+
+	vmovddup		24(%r12), %xmm12 // B[3]
+	vfnmadd231pd	%xmm8, %xmm12, %xmm6
+	vfnmadd231pd	%xmm9, %xmm12, %xmm7
+
+	addq	$32, %r12
+	addq	$32, %r11
+
+	cmpl	$0, %r10d
+
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10   <- A
+// r11   <- B
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10   <- A+4*4*sizeof(double)
+// r11   <- B+4*4*sizeof(double)
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_4x4_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#endif
+#endif
+	
+	vmovapd			0(%r10), %xmm8
+	vmovapd			16(%r10), %xmm9
+	vmovddup		0(%r11), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+
+	vmovapd			32(%r10), %xmm8
+	vmovapd			48(%r10), %xmm9
+	vmovddup		32(%r11), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+	vmovddup		40(%r11), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm2
+	vfmadd231pd		%xmm9, %xmm12, %xmm3
+
+	vmovapd			64(%r10), %xmm8
+	vmovapd			80(%r10), %xmm9
+	vmovddup		64(%r11), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+	vmovddup		72(%r11), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm2
+	vfmadd231pd		%xmm9, %xmm12, %xmm3
+	vmovddup		80(%r11), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm4
+	vfmadd231pd		%xmm9, %xmm12, %xmm5
+
+	vmovapd			96(%r10), %xmm8
+	vmovapd			112(%r10), %xmm9
+	vmovddup		96(%r11), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+	vmovddup		104(%r11), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm2
+	vfmadd231pd		%xmm9, %xmm12, %xmm3
+	vmovddup		112(%r11), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm4
+	vfmadd231pd		%xmm9, %xmm12, %xmm5
+	vmovddup		120(%r11), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm6
+	vfmadd231pd		%xmm9, %xmm12, %xmm7
+
+	addq			$128, %r10
+	addq			$128, %r11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_4x4_lib4, .-inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
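+
+
+
+// Reference (scalar) model of this triangular edge, assuming lib4 storage and
+// B upper triangular: only the first 4 columns of A and B are visited, and
+// column kk contributes to tile columns jj <= kk only (B(jj,kk) = 0 for jj > kk):
+//
+//	for(kk=0; kk<4; kk++)
+//		for(jj=0; jj<=kk; jj++)
+//			for(ii=0; ii<4; ii++)
+//				acc[ii+4*jj] += A[ii+4*kk]*B[jj+4*kk];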
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- max(k-4,0)
+// r11   <- A+4*4*sizeof(double)
+// r12   <- B+4*4*sizeof(double)
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_4x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+#endif
+	
+	vmovapd			0(%r11), %xmm8
+	vmovapd			16(%r11), %xmm9
+	subl			$1, %r10d
+	vmovddup		0(%r12), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+	addq			$32, %r11
+	addq			$32, %r12
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vmovapd			0(%r11), %xmm8
+	vmovapd			16(%r11), %xmm9
+	subl			$1, %r10d
+	vmovddup		0(%r12), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+	addq			$32, %r11
+	vmovddup		8(%r12), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm2
+	vfmadd231pd		%xmm9, %xmm12, %xmm3
+	addq			$32, %r12
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vmovapd			0(%r11), %xmm8
+	vmovapd			16(%r11), %xmm9
+	subl			$1, %r10d
+	vmovddup		0(%r12), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+	vmovddup		8(%r12), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm2
+	vfmadd231pd		%xmm9, %xmm12, %xmm3
+	addq			$32, %r11
+	vmovddup		16(%r12), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm4
+	vfmadd231pd		%xmm9, %xmm12, %xmm5
+	addq			$32, %r12
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	vmovapd			0(%r11), %xmm8
+	vmovapd			16(%r11), %xmm9
+	subl			$1, %r10d
+	vmovddup		0(%r12), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm0
+	vfmadd231pd		%xmm9, %xmm12, %xmm1
+	vmovddup		8(%r12), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm2
+	vfmadd231pd		%xmm9, %xmm12, %xmm3
+	vmovddup		16(%r12), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm4
+	vfmadd231pd		%xmm9, %xmm12, %xmm5
+	addq			$32, %r11
+	vmovddup		24(%r12), %xmm12
+	vfmadd231pd		%xmm8, %xmm12, %xmm6
+	vfmadd231pd		%xmm9, %xmm12, %xmm7
+	addq			$32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_4x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_4x4_lib4, @function
+inner_blend_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_4x4_lib4:
+#endif
+#endif
+	
+	// XXX nothing to blend
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_4x4_lib4, .-inner_blend_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_4x4_lib4, @function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_lib4:
+#endif
+#endif
+	
+	// XXX nothing to blend
+
+	// alpha
+	movddup	0(%r10), %xmm15
+
+	mulpd	%xmm15, %xmm0
+	mulpd	%xmm15, %xmm1
+	mulpd	%xmm15, %xmm2
+	mulpd	%xmm15, %xmm3
+
+
+	// beta
+	movddup	0(%r11), %xmm14
+
+
+	vmovapd		0(%r12), %xmm15
+	vfmadd231pd	%xmm14, %xmm15, %xmm0
+	vmovapd		16(%r12), %xmm15
+	vfmadd231pd	%xmm14, %xmm15, %xmm1
+	vmovapd		32(%r12), %xmm15
+	vfmadd231pd	%xmm14, %xmm15, %xmm2
+	vmovapd		48(%r12), %xmm15
+	vfmadd231pd	%xmm14, %xmm15, %xmm3
+	vmovapd		64(%r12), %xmm15
+	vfmadd231pd	%xmm14, %xmm15, %xmm4
+	vmovapd		80(%r12), %xmm15
+	vfmadd231pd	%xmm14, %xmm15, %xmm5
+	vmovapd		96(%r12), %xmm15
+	vfmadd231pd	%xmm14, %xmm15, %xmm6
+	vmovapd		112(%r12), %xmm15
+	vfmadd231pd	%xmm14, %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
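+
+
+
+// Reference (scalar) model of the scaling above (r10, r11 and r12 hold the
+// addresses of alpha, beta and C, see the argument comments): the 4x4
+// accumulator tile d (the xmm register pairs) becomes
+//
+//	for(jj=0; jj<4; jj++)
+//		for(ii=0; ii<4; ii++)
+//			d[ii+4*jj] = alpha[0]*d[ii+4*jj] + beta[0]*C[ii+4*jj];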
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+	
+	// alpha
+	movddup	0(%r10), %xmm15
+
+	mulpd	%xmm15, %xmm0
+	mulpd	%xmm15, %xmm1
+	mulpd	%xmm15, %xmm2
+	mulpd	%xmm15, %xmm3
+
+
+	// beta
+	movddup	0(%r11), %xmm14
+
+
+	vmovapd		0(%r12), %xmm15
+	vfmadd231pd	%xmm14, %xmm15, %xmm0
+	vmovapd		16(%r12), %xmm15
+	vfmadd231pd	%xmm14, %xmm15, %xmm1
+	vmovapd		32(%r12), %xmm15
+	vfmadd231pd	%xmm14, %xmm15, %xmm2
+	vmovapd		48(%r12), %xmm15
+	vfmadd231pd	%xmm14, %xmm15, %xmm3
+	vmovapd		64(%r12), %xmm15
+	vfmadd231pd	%xmm14, %xmm15, %xmm4
+	vmovapd		80(%r12), %xmm15
+	vfmadd231pd	%xmm14, %xmm15, %xmm5
+	vmovapd		96(%r12), %xmm15
+	vfmadd231pd	%xmm14, %xmm15, %xmm6
+	vmovapd		112(%r12), %xmm15
+	vfmadd231pd	%xmm14, %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10   <- C
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10   <- C
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_4x4_lib4, @function
+inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_4x4_lib4:
+#endif
+#endif	
+	
+	vmovapd		0(%r10), %xmm15
+	vaddpd		%xmm0, %xmm15, %xmm0
+	vmovapd		16(%r10), %xmm15
+	vaddpd		%xmm1, %xmm15, %xmm1
+	vmovapd		32(%r10), %xmm15
+	vaddpd		%xmm2, %xmm15, %xmm2
+	vmovapd		48(%r10), %xmm15
+	vaddpd		%xmm3, %xmm15, %xmm3
+	vmovapd		64(%r10), %xmm15
+	vaddpd		%xmm4, %xmm15, %xmm4
+	vmovapd		80(%r10), %xmm15
+	vaddpd		%xmm5, %xmm15, %xmm5
+	vmovapd		96(%r10), %xmm15
+	vaddpd		%xmm6, %xmm15, %xmm6
+	vmovapd		112(%r10), %xmm15
+	vaddpd		%xmm7, %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_4x4_lib4, .-inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// Cholesky factorization
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dpotrf_4x4_vs_lib4, @function
+inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dpotrf_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_vs_lib4:
+#endif
+#endif
+	
+	vxorpd			%xmm15, %xmm15, %xmm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	vmovsd			.LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+	vmovsd			LC04(%rip), %xmm14 // 1.0
+#endif
+
+	vmovsd			%xmm0, %xmm0, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe				1f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+2:
+	cmpl			$2, %r11d
+	vmovsd			%xmm13, 0(%r10)
+	vmovddup		%xmm13, %xmm13
+	vmulpd			%xmm0, %xmm13, %xmm0
+	vmulpd			%xmm1, %xmm13, %xmm1
+
+	jl				0f // ret
+
+	vpermilpd		$0x3, %xmm0, %xmm13
+	vfnmadd231pd	%xmm0, %xmm13, %xmm2
+	vfnmadd231pd	%xmm1, %xmm13, %xmm3
+	vpermilpd		$0x3, %xmm2, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe				3f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+4:
+	cmpl			$3, %r11d
+	vmovsd			%xmm13, 8(%r10)
+	vmovddup		%xmm13, %xmm13
+	vmulpd			%xmm2, %xmm13, %xmm2
+	vmulpd			%xmm3, %xmm13, %xmm3
+
+	jl				0f // ret
+
+	vpermilpd		$0x0, %xmm1, %xmm13
+//	vfnmadd231pd	%xmm0, %xmm13, %xmm4
+	vfnmadd231pd	%xmm1, %xmm13, %xmm5
+	vpermilpd		$0x0, %xmm3, %xmm13
+//	vfnmadd231pd	%xmm2, %xmm13, %xmm4
+	vfnmadd231pd	%xmm3, %xmm13, %xmm5
+	vmovaps			%xmm5, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe				5f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+6:
+	cmpl			$4, %r11d
+	vmovsd			%xmm13, 16(%r10)
+	vmovddup		%xmm13, %xmm13
+//	vmulpd			%xmm4, %xmm13, %xmm4
+	vmulpd			%xmm5, %xmm13, %xmm5
+
+	jl				0f // ret
+
+	vpermilpd		$0x3, %xmm1, %xmm13
+//	vfnmadd231pd	%xmm0, %xmm13, %xmm6
+	vfnmadd231pd	%xmm1, %xmm13, %xmm7
+	vpermilpd		$0x3, %xmm3, %xmm13
+//	vfnmadd231pd	%xmm2, %xmm13, %xmm6
+	vfnmadd231pd	%xmm3, %xmm13, %xmm7
+	vpermilpd		$0x3, %xmm5, %xmm13
+//	vfnmadd231pd	%xmm4, %xmm13, %xmm6
+	vfnmadd231pd	%xmm5, %xmm13, %xmm7
+	vpermilpd		$0x3, %xmm7, %xmm13
+	vucomisd		%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe				7f
+	vsqrtsd			%xmm13, %xmm13, %xmm13
+	vdivsd			%xmm13, %xmm14, %xmm13
+8:
+	vmovsd			%xmm13, 24(%r10)
+	vmovddup		%xmm13, %xmm13
+//	vmulpd			%xmm6, %xmm13, %xmm6
+	vmulpd			%xmm7, %xmm13, %xmm7
+
+	jmp		0f
+	
+1:
+	vxorpd	%xmm13, %xmm13, %xmm13
+	jmp		2b
+
+3:
+	vxorpd	%xmm13, %xmm13, %xmm13
+	jmp		4b
+
+5:
+	vxorpd	%xmm13, %xmm13, %xmm13
+	jmp		6b
+
+7:
+	vxorpd	%xmm13, %xmm13, %xmm13
+	jmp		8b
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dpotrf_4x4_vs_lib4, .-inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
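+
+
+
+// Reference (scalar) model of the 4x4 Cholesky edge above; its net effect is a
+// right-looking factorization of the lower accumulator tile d (the xmm register
+// pairs), with the reciprocals of the factor diagonal stored in inv_diag_E,
+// non-positive pivots zeroing the corresponding column, and kn limiting the
+// number of factorized columns:
+//
+//	for(jj=0; jj<4; jj++)
+//		{
+//		if(d[jj+4*jj] > 0.0)
+//			inv_diag_E[jj] = 1.0/sqrt(d[jj+4*jj]);
+//		else
+//			inv_diag_E[jj] = 0.0;                      // non-positive pivot
+//		for(ii=jj; ii<4; ii++)
+//			d[ii+4*jj] *= inv_diag_E[jj];              // scale column jj
+//		if(kn < jj+2)
+//			break;
+//		for(kk=jj+1; kk<4; kk++)                       // update trailing columns
+//			for(ii=kk; ii<4; ii++)
+//				d[ii+4*kk] -= d[ii+4*jj]*d[kk+4*jj];
+//		}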
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for Cholesky factorization
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_4x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+	
+	vmovddup		0(%r11), %xmm13
+	vmulpd			%xmm0, %xmm13, %xmm0
+	vmulpd			%xmm1, %xmm13, %xmm1
+
+	vmovddup		8(%r10), %xmm13
+	vfnmadd231pd	%xmm0, %xmm13, %xmm2
+	vfnmadd231pd	%xmm1, %xmm13, %xmm3
+	vmovddup		8(%r11), %xmm13
+	vmulpd			%xmm2, %xmm13, %xmm2
+	vmulpd			%xmm3, %xmm13, %xmm3
+
+	vmovddup		16(%r10), %xmm13
+	vfnmadd231pd	%xmm0, %xmm13, %xmm4
+	vfnmadd231pd	%xmm1, %xmm13, %xmm5
+	vmovddup		48(%r10), %xmm13
+	vfnmadd231pd	%xmm2, %xmm13, %xmm4
+	vfnmadd231pd	%xmm3, %xmm13, %xmm5
+	vmovddup		16(%r11), %xmm13
+	vmulpd			%xmm4, %xmm13, %xmm4
+	vmulpd			%xmm5, %xmm13, %xmm5
+
+	vmovddup		24(%r10), %xmm13
+	vfnmadd231pd	%xmm0, %xmm13, %xmm6
+	vfnmadd231pd	%xmm1, %xmm13, %xmm7
+	vmovddup		56(%r10), %xmm13
+	vfnmadd231pd	%xmm2, %xmm13, %xmm6
+	vfnmadd231pd	%xmm3, %xmm13, %xmm7
+	vmovddup		88(%r10), %xmm13
+	vfnmadd231pd	%xmm4, %xmm13, %xmm6
+	vfnmadd231pd	%xmm5, %xmm13, %xmm7
+	vmovddup		24(%r11), %xmm13
+	vmulpd			%xmm6, %xmm13, %xmm6
+	vmulpd			%xmm7, %xmm13, %xmm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
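+
+
+
+// Reference (scalar) model of the triangular substitution above: with E the
+// 4x4 lower-triangular factor at r10 (lib4 layout) and inv_diag_E its
+// precomputed reciprocal diagonal at r11, the accumulator tile d is overwritten
+// with d * E^{-T}, one column at a time:
+//
+//	for(jj=0; jj<4; jj++)
+//		{
+//		for(kk=0; kk<jj; kk++)
+//			for(ii=0; ii<4; ii++)
+//				d[ii+4*jj] -= d[ii+4*kk]*E[jj+4*kk];   // subtract solved columns
+//		for(ii=0; ii<4; ii++)
+//			d[ii+4*jj] *= inv_diag_E[jj];              // divide by E(jj,jj)
+//		}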
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for Cholesky factorization
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
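+//
+// same substitution as inner_edge_dtrsm_rlt_inv_4x4_lib4 above, restricted to the
+// first kn columns of the block (early return once kn columns have been solved).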
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#endif
+#endif
+	
+	vmovddup		0(%r11), %xmm13
+	cmpl			$2, %r12d
+	vmulpd			%xmm0, %xmm13, %xmm0
+	vmulpd			%xmm1, %xmm13, %xmm1
+
+	jl				0f // ret
+
+	vmovddup		8(%r10), %xmm13
+	cmpl			$3, %r12d
+	vfnmadd231pd	%xmm0, %xmm13, %xmm2
+	vfnmadd231pd	%xmm1, %xmm13, %xmm3
+	vmovddup		8(%r11), %xmm13
+	vmulpd			%xmm2, %xmm13, %xmm2
+	vmulpd			%xmm3, %xmm13, %xmm3
+
+	jl				0f // ret
+
+	vmovddup		16(%r10), %xmm13
+	cmpl			$4, %r12d
+	vfnmadd231pd	%xmm0, %xmm13, %xmm4
+	vfnmadd231pd	%xmm1, %xmm13, %xmm5
+	vmovddup		48(%r10), %xmm13
+	vfnmadd231pd	%xmm2, %xmm13, %xmm4
+	vfnmadd231pd	%xmm3, %xmm13, %xmm5
+	vmovddup		16(%r11), %xmm13
+	vmulpd			%xmm4, %xmm13, %xmm4
+	vmulpd			%xmm5, %xmm13, %xmm5
+
+	jl				0f // ret
+
+	vmovddup		24(%r10), %xmm13
+	vfnmadd231pd	%xmm0, %xmm13, %xmm6
+	vfnmadd231pd	%xmm1, %xmm13, %xmm7
+	vmovddup		56(%r10), %xmm13
+	vfnmadd231pd	%xmm2, %xmm13, %xmm6
+	vfnmadd231pd	%xmm3, %xmm13, %xmm7
+	vmovddup		88(%r10), %xmm13
+	vfnmadd231pd	%xmm4, %xmm13, %xmm6
+	vfnmadd231pd	%xmm5, %xmm13, %xmm7
+	vmovddup		24(%r11), %xmm13
+	vmulpd			%xmm6, %xmm13, %xmm6
+	vmulpd			%xmm7, %xmm13, %xmm7
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
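+//
+// reference computation (C-style sketch for readability; acc[] is an illustrative
+// name for the register block xmm0..xmm7, two rows per register):
+//   for(j=0; j<4; j++)
+//     for(i=0; i<4; i++)
+//       D[i+4*j] = acc[i+4*j]; // column j of the 4x4 block at byte offset 32*j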
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+	
+	vmovapd %xmm0,   0(%r10)
+	vmovapd %xmm1,  16(%r10)
+	vmovapd %xmm2,  32(%r10)
+	vmovapd %xmm3,  48(%r10)
+	vmovapd %xmm4,  64(%r10)
+	vmovapd %xmm5,  80(%r10)
+	vmovapd %xmm6,  96(%r10)
+	vmovapd %xmm7, 112(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// TODO use blendv instead
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
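+//
+// reference computation (C-style sketch for readability; km/kn clip the block at
+// the matrix border, acc[] is an illustrative name for xmm0..xmm7):
+//   for(j=0; j<kn; j++)
+//     for(i=0; i<km; i++)
+//       D[i+4*j] = acc[i+4*j];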
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_vs_lib4, @function
+inner_store_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl		$2, %r11d
+	jg			1f
+	je			0f
+
+	// km==1
+	cmpl		$2, %r12d
+	vmovsd		%xmm0,  0(%r10)
+	jl			4f // end
+	cmpl		$3, %r12d
+	vmovsd		%xmm2, 32(%r10)
+	jl			4f // end
+	vmovsd		%xmm4, 64(%r10)
+	je			4f // end
+	vmovsd		%xmm6, 96(%r10)
+
+	jmp		4f
+
+0:
+	// km==2
+	cmpl		$2, %r12d
+	vmovapd		%xmm0,  0(%r10)
+	jl			4f // end
+	cmpl		$3, %r12d
+	vmovapd		%xmm2, 32(%r10)
+	jl			4f // end
+	vmovapd		%xmm4, 64(%r10)
+	je			4f // end
+	vmovapd		%xmm6, 96(%r10)
+
+	jmp		4f
+
+1:
+	cmpl		$3, %r11d
+	jg			2f
+
+	// km==3
+	cmpl		$2, %r12d
+	vmovapd		%xmm0,   0(%r10)
+	vmovsd		%xmm1,  16(%r10)
+	jl			4f // end
+	cmpl		$3, %r12d
+	vmovapd		%xmm2,  32(%r10)
+	vmovsd		%xmm3,  48(%r10)
+	jl			4f // end
+	vmovapd		%xmm4,  64(%r10)
+	vmovsd		%xmm5,  80(%r10)
+	je			4f // end
+	vmovapd		%xmm6,  96(%r10)
+	vmovsd		%xmm7, 112(%r10)
+
+	jmp		4f
+
+2:
+	// km==4
+	cmpl		$2, %r12d
+	vmovapd		%xmm0,   0(%r10)
+	vmovapd		%xmm1,  16(%r10)
+	jl			4f // end
+	cmpl		$3, %r12d
+	vmovapd		%xmm2,  32(%r10)
+	vmovapd		%xmm3,  48(%r10)
+	jl			4f // end
+	vmovapd		%xmm4,  64(%r10)
+	vmovapd		%xmm5,  80(%r10)
+	je			4f // end
+	vmovapd		%xmm6,  96(%r10)
+	vmovapd		%xmm7, 112(%r10)
+
+4:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_vs_lib4, .-inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular
+//
+// input arguments:
+// r10   <- D
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
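+//
+// reference computation (C-style sketch for readability; acc[] is an illustrative
+// name for xmm0..xmm7): only the lower triangle is written, the strictly upper
+// part of D already in memory is preserved:
+//   for(j=0; j<4; j++)
+//     for(i=j; i<4; i++)
+//       D[i+4*j] = acc[i+4*j];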
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_4x4_lib4, @function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_lib4:
+#endif
+#endif
+	
+	vmovapd		%xmm0,   0(%r10)
+	vmovapd		%xmm1,  16(%r10)
+	vmovsd		32(%r10), %xmm15
+	vmovsd		%xmm15, %xmm2, %xmm2
+	vmovapd		%xmm2,  32(%r10)
+	vmovapd		%xmm3,  48(%r10)
+//	vmovapd		%xmm4,  64(%r10)
+	vmovapd		%xmm5,  80(%r10)
+//	vmovapd		%xmm6,  96(%r10)
+	vmovsd		112(%r10), %xmm15
+	vmovsd		%xmm15, %xmm7, %xmm7
+	vmovapd		%xmm7, 112(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs lower triangular
+//
+// input arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
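+//
+// as inner_store_l_4x4_lib4 above, but clipped to the first km rows and kn columns:
+// only entries with j < kn and j <= i < km are written.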
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_4x4_vs_lib4, @function
+inner_store_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl		$2, %r11d
+	jg			1f
+	je			0f
+
+	// km==1
+	vmovsd		%xmm0,  0(%r10)
+
+	jmp		3f
+
+0:
+	// km==2
+	cmpl		$2, %r12d
+	vmovapd		%xmm0,  0(%r10)
+	jl			3f // end
+	vmovsd		32(%r10), %xmm15
+	vmovsd		%xmm15, %xmm2, %xmm2
+	vmovapd		%xmm2, 32(%r10)
+
+	jmp		3f
+
+1:
+	cmpl		$3, %r11d
+	jg			2f
+
+	// km==3
+	cmpl		$2, %r12d
+	vmovapd		%xmm0,   0(%r10)
+	vmovsd		%xmm1,  16(%r10)
+	jl			3f // end
+	cmpl		$3, %r12d
+	vmovsd		32(%r10), %xmm15
+	vmovsd		%xmm15, %xmm2, %xmm2
+	vmovapd		%xmm2,  32(%r10)
+	vmovsd		%xmm3,  48(%r10)
+	jl			3f // end
+//	vmovapd		%xmm4,  64(%r10)
+	vmovsd		%xmm5,  80(%r10)
+
+	jmp		3f
+
+2:
+	// km==4
+	cmpl		$2, %r12d
+	vmovapd		%xmm0,   0(%r10)
+	vmovapd		%xmm1,  16(%r10)
+	jl			3f // end
+	cmpl		$3, %r12d
+	vmovsd		32(%r10), %xmm15
+	vmovsd		%xmm15, %xmm2, %xmm2
+	vmovapd		%xmm2,  32(%r10)
+	vmovapd		%xmm3,  48(%r10)
+	jl			3f // end
+//	vmovapd		%xmm4,  64(%r10)
+	vmovapd		%xmm5,  80(%r10)
+	je			3f // end
+//	vmovapd		%xmm6,  96(%r10)
+	vmovsd		112(%r10), %xmm15
+	vmovsd		%xmm15, %xmm7, %xmm7
+	vmovapd		%xmm7, 112(%r10)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_4x4_vs_lib4, .-inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+//                               rdi    rsi            rdx        rcx        r8            r9         rsp+8
+// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
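+//
+// reference computation (C-style sketch for readability; A and B are 4 x k panels
+// in lib4 storage, C and D are 4x4 blocks, loop indices are illustrative):
+//   for(j=0; j<4; j++)
+//     for(i=0; i<4; i++) {
+//       double tmp = 0.0;
+//       for(l=0; l<k; l++)
+//         tmp += A[i+4*l] * B[j+4*l];
+//       D[i+4*j] = alpha[0]*tmp + beta[0]*C[i+4*j];
+//     }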
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x4_lib4
+	.type kernel_dgemm_nt_4x4_lib4, @function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x4_lib4
+	.def kernel_dgemm_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx        r8            r9         rsp+8     rsp+16   rsp+24
+// void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
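+//
+// same computation as kernel_dgemm_nt_4x4_lib4 above, but only the first km rows
+// and kn columns of the 4x4 result block are stored (variable-size store).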
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x4_vs_lib4
+	.type kernel_dgemm_nt_4x4_vs_lib4, @function
+kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x4_vs_lib4
+_kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x4_vs_lib4
+	.def kernel_dgemm_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_vs_lib4, .-kernel_dgemm_nt_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                 rdi    rsi            rdx        rcx        r8            r9         rsp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
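+//
+// reference computation (C-style sketch for readability): as the dgemm nt kernel
+// above, but only the lower triangle of the 4x4 result is written to D:
+//   for(j=0; j<4; j++)
+//     for(i=j; i<4; i++) {
+//       double tmp = 0.0;
+//       for(l=0; l<k; l++)
+//         tmp += A[i+4*l] * B[j+4*l];
+//       D[i+4*j] = alpha[0]*tmp + beta[0]*C[i+4*j];
+//     }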
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_4x4_lib4
+	.type kernel_dsyrk_nt_l_4x4_lib4, @function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_4x4_lib4
+	.def kernel_dsyrk_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call	inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq	_inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+//                                    rdi    rsi            rdx        rcx        r8            r9         rsp+8     rsp+16   rsp+24
+// void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_4x4_vs_lib4
+	.type kernel_dsyrk_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_4x4_vs_lib4
+_kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_4x4_vs_lib4
+	.def kernel_dsyrk_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call	inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq	_inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_4x4_vs_lib4, .-kernel_dsyrk_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx        r8            r9         rsp+8
+// void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
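+//
+// structural note: the generic nt kernel below accumulates the contribution of
+// panel columns 4..k-1 of A and B, the edge routine adds the leading 4x4 block of
+// B treated as triangular (the _ru_ case), and the result is scaled by alpha/beta
+// against C before a full 4x4 store.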
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_4x4_lib4
+	.type kernel_dtrmm_nt_ru_4x4_lib4, @function
+kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_4x4_lib4
+_kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_4x4_lib4
+	.def kernel_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // B
+	addq	$128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG3, %r10
+	movq	ARG4, %r11
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_4x4_lib4, .-kernel_dtrmm_nt_ru_4x4_lib4
+#endif
+
+
+
+
+
+//                                     rdi    rsi            rdx        rcx        r8            r9         rsp+8     rsp+16   rsp+24
+// void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+	.type kernel_dtrmm_nt_ru_4x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_4x4_vs_lib4
+_kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+	.def kernel_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // B
+	addq	$128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_4x4_vs_lib4, .-kernel_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                  edi    rsi        rdx        rcx        r8         r9
+// void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
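+//
+// reference computation (C-style sketch for readability; T[] is an illustrative
+// temporary for the 4x4 block C - A*B^T, factorized as T = L*L^T with L stored in
+// the lower triangle of D and the reciprocals of its diagonal in inv_diag_D):
+//   for(j=0; j<4; j++) {
+//     for(i=j; i<4; i++) {
+//       double t = C[i+4*j];
+//       for(l=0; l<k; l++) t -= A[i+4*l] * B[j+4*l];
+//       for(l=0; l<j; l++) t -= D[i+4*l] * D[j+4*l];
+//       T[i+4*j] = t;
+//     }
+//     inv_diag_D[j] = (T[j+4*j]>0.0) ? 1.0/sqrt(T[j+4*j]) : 0.0;
+//     for(i=j; i<4; i++) D[i+4*j] = T[i+4*j] * inv_diag_D[j];
+//   }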
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_4x4_lib4
+	.type kernel_dpotrf_nt_l_4x4_lib4, @function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_4x4_lib4
+	.def kernel_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movl	$4, %r11d // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+//                                     edi    rsi        rdx        rcx        r8         r9                  rsp+8   rsp+16
+// void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_4x4_vs_lib4
+	.type kernel_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_4x4_vs_lib4
+	.def kernel_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movq	ARG8, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG7, %r11 // km 
+	movq	ARG8, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                        edi     rsi         rdx         ecx     r8          r9          rsp+8      rsp+16     rsp+24
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
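+//
+// fused syrk + potrf: the block T = C + Ap*Bp^T - Am*Bm^T is accumulated (kp resp.
+// km panel columns), then factorized exactly as in kernel_dpotrf_nt_l_4x4_lib4 above.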
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$4, %r11d
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+//                                           edi     rsi         rdx         ecx     r8          r9          rsp+8      rsp+16     rsp+24              rsp+32  rsp+40
+// void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+	movq	ARG10, %r11 // km 
+	movq	ARG11, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        rcx        r8         r9         rsp+8
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
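+//
+// reference computation (C-style sketch for readability): solves D * E^T = C - A*B^T
+// for the 4x4 block D, with E lower triangular and inv_diag_E holding the
+// reciprocals of its diagonal:
+//   for(j=0; j<4; j++)
+//     for(i=0; i<4; i++) {
+//       double t = C[i+4*j];
+//       for(l=0; l<k; l++) t -= A[i+4*l] * B[j+4*l];
+//       for(l=0; l<j; l++) t -= D[i+4*l] * E[j+4*l];
+//       D[i+4*j] = t * inv_diag_E[j];
+//     }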
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+	.type kernel_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+	.def kernel_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                            edi     rsi         rdx         ecx     r8          r9          rsp+8      rsp+16     rsp+24     rsp+32
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
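+//
+// fused gemm + trsm: as kernel_dtrsm_nt_rl_inv_4x4_lib4 above, but the right-hand
+// side is built as C + Ap*Bp^T - Am*Bm^T (kp resp. km panel columns) before the
+// triangular solve with E.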
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10   // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        rcx        r8       r9           rsp+8               rsp+16  rsp+24
+// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.type kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.def kernel_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn // TODO scale gen
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                               edi     rsi         rdx         ecx     r8          r9          rsp+8      rsp+16     rsp+24     rsp+32              rsp+40  rsp+48
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10  // C 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D 
+	movq	ARG11, %r11 // km 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/kernel/sse3/Makefile b/kernel/sse3/Makefile
new file mode 100644
index 0000000..dbc07d1
--- /dev/null
+++ b/kernel/sse3/Makefile
@@ -0,0 +1,49 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS = 
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += kernel_dgemm_4x4_lib4.o
+OBJS +=
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+	rm -f *.o
+	rm -f *.s
+
diff --git a/kernel/sse3/kernel_dgemm_4x4_lib4.S b/kernel/sse3/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..26f35b6
--- /dev/null
+++ b/kernel/sse3/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,6235 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp);
+#define EPILOGUE \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	addq	$STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+	subq	$STACKSIZE, %rsp; \
+	movq	%rbx,   (%rsp); \
+	movq	%rbp,  8(%rsp); \
+	movq	%r12, 16(%rsp); \
+	movq	%r13, 24(%rsp); \
+	movq	%r14, 32(%rsp); \
+	movq	%r15, 40(%rsp); \
+	movq	%rdi, 48(%rsp); \
+	movq	%rsi, 56(%rsp); \
+	vmovups	%xmm6, 64(%rsp); \
+	vmovups	%xmm7, 80(%rsp); \
+	vmovups	%xmm8, 96(%rsp); \
+	vmovups	%xmm9, 112(%rsp); \
+	vmovups	%xmm10, 128(%rsp); \
+	vmovups	%xmm11, 144(%rsp); \
+	vmovups	%xmm12, 160(%rsp); \
+	vmovups	%xmm13, 176(%rsp); \
+	vmovups	%xmm14, 192(%rsp); \
+	vmovups	%xmm15, 208(%rsp);
+#define EPILOGUE \
+	movq	  (%rsp), %rbx; \
+	movq	 8(%rsp), %rbp; \
+	movq	16(%rsp), %r12; \
+	movq	24(%rsp), %r13; \
+	movq	32(%rsp), %r14; \
+	movq	40(%rsp), %r15; \
+	movq	48(%rsp), %rdi; \
+	movq	56(%rsp), %rsi; \
+	vmovups	64(%rsp), %xmm6; \
+	vmovups	80(%rsp), %xmm7; \
+	vmovups	96(%rsp), %xmm8; \
+	vmovups	112(%rsp), %xmm9; \
+	vmovups	128(%rsp), %xmm10; \
+	vmovups	144(%rsp), %xmm11; \
+	vmovups	160(%rsp), %xmm12; \
+	vmovups	176(%rsp), %xmm13; \
+	vmovups	192(%rsp), %xmm14; \
+	vmovups	208(%rsp), %xmm15; \
+	addq	$STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
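+
+// note: PROLOGUE/EPILOGUE save and restore the callee-saved registers of the
+// System V AMD64 (Linux/Mac) resp. Microsoft x64 (Windows, which also requires
+// xmm6-xmm15 and rdi/rsi) calling conventions; ARGn maps the n-th C argument to
+// its register or stack slot.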
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+4*k*sizeof(double)
+// xmm0  <- [d00 d10]
+// xmm1  <- [d20 d30]
+// xmm2  <- [d01 d11]
+// xmm3  <- [d21 d31]
+// xmm4  <- [d02 d12]
+// xmm5  <- [d22 d32]
+// xmm6  <- [d03 d13]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
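+//
+// reference computation (C-style sketch for readability; acc[] is an illustrative
+// name for the accumulators, which the code keeps in a shuffled lane order that
+// the INNER_BLEND_* routines later restore):
+//   for(l=0; l<k; l++)
+//     for(j=0; j<4; j++)
+//       for(i=0; i<4; i++)
+//         acc[i+4*j] += A[i+4*l] * B[j+4*l];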
+
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+	movapd		0(%r11), %xmm8 // A[0]
+	movapd 		16(%r11), %xmm9 // A[2]
+	movapd		0(%r12), %xmm10 // B[0]
+
+	xorpd		%xmm11, %xmm11
+	movapd		%xmm11, %xmm12
+	movapd		%xmm11, %xmm13
+	movapd		%xmm11, %xmm14
+	movapd		%xmm11, %xmm15
+
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	// unroll 0
+	addpd		%xmm14, %xmm3
+	movapd		16(%r12), %xmm14 // B[2]
+	addpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+
+	addpd		%xmm12, %xmm2
+	addpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	addpd		%xmm10, %xmm1
+	movapd		32(%r12), %xmm10 // B[4]
+	addpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	addpd		%xmm15, %xmm0
+	addpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		32(%r11), %xmm8 // A[4]
+	mulpd		%xmm9, %xmm13
+	movapd 		48(%r11), %xmm9 // A[6]
+
+
+	// unroll 1
+	addpd		%xmm14, %xmm3
+	movapd		48(%r12), %xmm14 // B[6]
+	addpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+
+	addpd		%xmm12, %xmm2
+	addpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	addpd		%xmm10, %xmm1
+	movapd		64(%r12), %xmm10 // B[8]
+	addpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	addpd		%xmm15, %xmm0
+	addpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		64(%r11), %xmm8 // A[8]
+	mulpd		%xmm9, %xmm13
+	movapd 		80(%r11), %xmm9 // A[10]
+
+
+	// unroll 2
+	addpd		%xmm14, %xmm3
+	movapd		80(%r12), %xmm14 // B[10]
+	addpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	subl		$4, %r10d
+
+	addpd		%xmm12, %xmm2
+	addpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	addpd		%xmm10, %xmm1
+	movapd		96(%r12), %xmm10 // B[12]
+	addpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	addpd		%xmm15, %xmm0
+	addpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		96(%r11), %xmm8 // A[12]
+	mulpd		%xmm9, %xmm13
+	movapd 		112(%r11), %xmm9 // A[14]
+	
+
+	// unroll 3
+	addpd		%xmm14, %xmm3
+	movapd		112(%r12), %xmm14 // B[14]
+	addpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addq		$128, %r12 // B += 16
+
+	addpd		%xmm12, %xmm2
+	addpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addq		$128, %r11 // A += 16
+
+	addpd		%xmm10, %xmm1
+	movapd		0(%r12), %xmm10 // B[0]
+	addpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+	cmpl		$4, %r10d
+
+	addpd		%xmm15, %xmm0
+	addpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		0(%r11), %xmm8 // A[0]
+	mulpd		%xmm9, %xmm13
+	movapd 		16(%r11), %xmm9 // A[2]
+
+
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	addpd		%xmm14, %xmm3
+	movapd		16(%r12), %xmm14 // B[2]
+	addpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+
+	addpd		%xmm12, %xmm2
+	addpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	addpd		%xmm10, %xmm1
+	movapd		32(%r12), %xmm10 // B[4]
+	addpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	addpd		%xmm15, %xmm0
+	addpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		32(%r11), %xmm8 // A[4]
+	mulpd		%xmm9, %xmm13
+	movapd 		48(%r11), %xmm9 // A[6]
+
+
+	// unroll 1
+	addpd		%xmm14, %xmm3
+	movapd		48(%r12), %xmm14 // B[6]
+	addpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+
+	addpd		%xmm12, %xmm2
+	addpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	addpd		%xmm10, %xmm1
+	movapd		64(%r12), %xmm10 // B[8]
+	addpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	addpd		%xmm15, %xmm0
+	addpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		64(%r11), %xmm8 // A[8]
+	mulpd		%xmm9, %xmm13
+	movapd 		80(%r11), %xmm9 // A[10]
+
+
+	// unroll 2
+	addpd		%xmm14, %xmm3
+	movapd		80(%r12), %xmm14 // B[10]
+	addpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	subl		$4, %r10d
+
+	addpd		%xmm12, %xmm2
+	addpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	addpd		%xmm10, %xmm1
+	movapd		96(%r12), %xmm10 // B[12]
+	addpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	addpd		%xmm15, %xmm0
+	addpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		96(%r11), %xmm8 // A[12]
+	mulpd		%xmm9, %xmm13
+	movapd 		112(%r11), %xmm9 // A[14]
+	
+
+	// unroll 3
+	addpd		%xmm14, %xmm3
+	movapd		112(%r12), %xmm14 // B[14]
+	addpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addq		$128, %r12 // B += 16
+
+	addpd		%xmm12, %xmm2
+	addpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addq		$128, %r11 // A += 16
+
+	addpd		%xmm10, %xmm1
+//	movapd		0(%r12), %xmm10 // B[0]
+	addpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+//	cmpl		$4, %r10d
+
+	addpd		%xmm15, %xmm0
+	addpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+//	movapd 		0(%r11), %xmm8 // A[0]
+	mulpd		%xmm9, %xmm13
+//	movapd 		16(%r11), %xmm9 // A[2]
+
+
+	// clean accumulators
+	addpd		%xmm14, %xmm3
+	addpd		%xmm11, %xmm7
+	addpd		%xmm12, %xmm2
+	addpd		%xmm13, %xmm6
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+
+	// unroll 0
+	addpd		%xmm14, %xmm3
+	movapd		16(%r12), %xmm14 // B[2]
+	addpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	subl	$1, %r10d
+
+	addpd		%xmm12, %xmm2
+	addpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addq	$32, %r12
+
+	addpd		%xmm10, %xmm1
+	movapd		32(%r12), %xmm10 // B[0]
+	addpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+	addq	$32, %r11
+
+	addpd		%xmm15, %xmm0
+	addpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		32(%r11), %xmm8 // A[0]
+	mulpd		%xmm9, %xmm13
+	movapd 		48(%r11), %xmm9 // A[2]
+
+	cmpl	$0, %r10d
+
+	jg		3b // clean up loop 
+
+
+	// clean accumulators
+	addpd		%xmm14, %xmm3
+	addpd		%xmm11, %xmm7
+	addpd		%xmm12, %xmm2
+	addpd		%xmm13, %xmm6
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// xmm0  <- [d01 d10]
+// xmm1  <- [d00 d11]
+// xmm2  <- [d03 d12]
+// xmm3  <- [d02 d13]
+// xmm4  <- [d21 d30]
+// xmm5  <- [d20 d31]
+// xmm6  <- [d23 d32]
+// xmm7  <- [d22 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+4*k*sizeof(double)
+// xmm0  <- [d01 d10]
+// xmm1  <- [d00 d11]
+// xmm2  <- [d03 d12]
+// xmm3  <- [d02 d13]
+// xmm4  <- [d21 d30]
+// xmm5  <- [d20 d31]
+// xmm6  <- [d23 d32]
+// xmm7  <- [d22 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
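+// reference semantics (comment-only sketch): same structure as the add_nt
+// routine above with subpd in place of addpd, i.e. roughly
+//   D[ii+4*jj] -= A[ii+4*ll] * B[jj+4*ll];
+// for each of the k columns.
+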
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_sub_nt_4x4_lib4, @function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_sub_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+	movapd		0(%r11), %xmm8 // A[0]
+	movapd 		16(%r11), %xmm9 // A[2]
+	movapd		0(%r12), %xmm10 // B[0]
+
+	xorpd		%xmm11, %xmm11
+	movapd		%xmm11, %xmm12
+	movapd		%xmm11, %xmm13
+	movapd		%xmm11, %xmm14
+	movapd		%xmm11, %xmm15
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	// unroll 0
+	subpd		%xmm14, %xmm3
+	movapd		16(%r12), %xmm14 // B[2]
+	subpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+
+	subpd		%xmm12, %xmm2
+	subpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	subpd		%xmm10, %xmm1
+	movapd		32(%r12), %xmm10 // B[4]
+	subpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	subpd		%xmm15, %xmm0
+	subpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		32(%r11), %xmm8 // A[4]
+	mulpd		%xmm9, %xmm13
+	movapd 		48(%r11), %xmm9 // A[6]
+
+
+	// unroll 1
+	subpd		%xmm14, %xmm3
+	movapd		48(%r12), %xmm14 // B[6]
+	subpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+
+	subpd		%xmm12, %xmm2
+	subpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	subpd		%xmm10, %xmm1
+	movapd		64(%r12), %xmm10 // B[8]
+	subpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	subpd		%xmm15, %xmm0
+	subpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		64(%r11), %xmm8 // A[8]
+	mulpd		%xmm9, %xmm13
+	movapd 		80(%r11), %xmm9 // A[10]
+
+
+	// unroll 2
+	subpd		%xmm14, %xmm3
+	movapd		80(%r12), %xmm14 // B[10]
+	subpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	subl		$4, %r10d
+
+	subpd		%xmm12, %xmm2
+	subpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	subpd		%xmm10, %xmm1
+	movapd		96(%r12), %xmm10 // B[12]
+	subpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	subpd		%xmm15, %xmm0
+	subpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		96(%r11), %xmm8 // A[12]
+	mulpd		%xmm9, %xmm13
+	movapd 		112(%r11), %xmm9 // A[14]
+	
+
+	// unroll 3
+	subpd		%xmm14, %xmm3
+	movapd		112(%r12), %xmm14 // B[14]
+	subpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addq		$128, %r12 // B += 16
+
+	subpd		%xmm12, %xmm2
+	subpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addq		$128, %r11 // A += 16
+
+	subpd		%xmm10, %xmm1
+	movapd		0(%r12), %xmm10 // B[0]
+	subpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+	cmpl		$4, %r10d
+
+	subpd		%xmm15, %xmm0
+	subpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		0(%r11), %xmm8 // A[0]
+	mulpd		%xmm9, %xmm13
+	movapd 		16(%r11), %xmm9 // A[2]
+
+
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	subpd		%xmm14, %xmm3
+	movapd		16(%r12), %xmm14 // B[2]
+	subpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+
+	subpd		%xmm12, %xmm2
+	subpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	subpd		%xmm10, %xmm1
+	movapd		32(%r12), %xmm10 // B[4]
+	subpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	subpd		%xmm15, %xmm0
+	subpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		32(%r11), %xmm8 // A[4]
+	mulpd		%xmm9, %xmm13
+	movapd 		48(%r11), %xmm9 // A[6]
+
+
+	// unroll 1
+	subpd		%xmm14, %xmm3
+	movapd		48(%r12), %xmm14 // B[6]
+	subpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+
+	subpd		%xmm12, %xmm2
+	subpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	subpd		%xmm10, %xmm1
+	movapd		64(%r12), %xmm10 // B[8]
+	subpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	subpd		%xmm15, %xmm0
+	subpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		64(%r11), %xmm8 // A[8]
+	mulpd		%xmm9, %xmm13
+	movapd 		80(%r11), %xmm9 // A[10]
+
+
+	// unroll 2
+	subpd		%xmm14, %xmm3
+	movapd		80(%r12), %xmm14 // B[10]
+	subpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	subl		$4, %r10d
+
+	subpd		%xmm12, %xmm2
+	subpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	subpd		%xmm10, %xmm1
+	movapd		96(%r12), %xmm10 // B[12]
+	subpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	subpd		%xmm15, %xmm0
+	subpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		96(%r11), %xmm8 // A[12]
+	mulpd		%xmm9, %xmm13
+	movapd 		112(%r11), %xmm9 // A[14]
+	
+
+	// unroll 3
+	subpd		%xmm14, %xmm3
+	movapd		112(%r12), %xmm14 // B[14]
+	subpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addq		$128, %r12 // B += 16
+
+	subpd		%xmm12, %xmm2
+	subpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addq		$128, %r11 // A += 16
+
+	subpd		%xmm10, %xmm1
+//	movapd		0(%r12), %xmm10 // B[0]
+	subpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+//	cmpl		$4, %r10d
+
+	subpd		%xmm15, %xmm0
+	subpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+//	movapd 		0(%r11), %xmm8 // A[0]
+	mulpd		%xmm9, %xmm13
+//	movapd 		16(%r11), %xmm9 // A[2]
+
+
+	// update accumulators
+	subpd		%xmm14, %xmm3
+	subpd		%xmm11, %xmm7
+	subpd		%xmm12, %xmm2
+	subpd		%xmm13, %xmm6
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+
+	// unroll 0
+	subpd		%xmm14, %xmm3
+	movapd		16(%r12), %xmm14 // B[2]
+	subpd		%xmm11, %xmm7
+	movapd		%xmm10, %xmm11
+	pshufd		$0x4e, %xmm10, %xmm15
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	subl	$1, %r10d
+
+	subpd		%xmm12, %xmm2
+	subpd		%xmm13, %xmm6
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addq	$32, %r12
+
+	subpd		%xmm10, %xmm1
+	movapd		32(%r12), %xmm10 // B[0]
+	subpd		%xmm11, %xmm5
+	movapd		%xmm14, %xmm11
+	pshufd		$0x4e, %xmm14, %xmm12
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+	addq	$32, %r11
+
+	subpd		%xmm15, %xmm0
+	subpd		%xmm13, %xmm4
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		32(%r11), %xmm8 // A[0]
+	mulpd		%xmm9, %xmm13
+	movapd 		48(%r11), %xmm9 // A[2]
+
+	cmpl	$0, %r10d
+
+	jg		3b // clean up loop 
+
+
+	// update accumulators
+	subpd		%xmm14, %xmm3
+	subpd		%xmm11, %xmm7
+	subpd		%xmm12, %xmm2
+	subpd		%xmm13, %xmm6
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- 4*sdb*sizeof(double)
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+(k/4)*4*sdb*sizeof(double)+(k%4)*sizeof(double)
+// r13   <- 4*sdb*sizeof(double)
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
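+// reference semantics (comment-only sketch): B is read in panel-major (lib4)
+// storage, element (ll,jj) of the current panel at B[ll+4*jj], with the next
+// panel 4*sdb doubles further on; one k-iteration is roughly
+//   for(jj=0; jj<4; jj++)
+//     for(ii=0; ii<4; ii++)
+//       D[ii+4*jj] += A[ii+4*ll] * B[ll+4*jj];
+// hence the movddup broadcasts of single B entries instead of the pshufd
+// shuffle used by the nt kernel.
+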
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nn_4x4_lib4, @function
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+	movapd		0(%r11), %xmm8 // A[0]
+	movapd 		16(%r11), %xmm9 // A[2]
+
+	xorpd		%xmm11, %xmm11
+	movapd		%xmm11, %xmm12
+	movapd		%xmm11, %xmm13
+	movapd		%xmm11, %xmm14
+	movapd		%xmm11, %xmm15
+
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+
+	prefetcht0	0(%r12, %r13, 2) // software prefetch
+	prefetcht0	64(%r12, %r13, 2) // software prefetch
+
+	// unroll 0
+	movddup		0(%r12), %xmm10 // B[0]
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+
+	movddup		32(%r12), %xmm15 // B[4]
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	movddup		64(%r12), %xmm14 // B[8]
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	movddup		96(%r12), %xmm12 // B[12]
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		32(%r11), %xmm8 // A[4]
+	mulpd		%xmm9, %xmm13
+	movapd 		48(%r11), %xmm9 // A[6]
+
+
+	// unroll 1
+	movddup		8(%r12), %xmm10 // B[1]
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+
+	movddup		40(%r12), %xmm15 // B[5]
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	movddup		72(%r12), %xmm14 // B[9]
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	movddup		104(%r12), %xmm12 // B[13]
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		64(%r11), %xmm8 // A[8]
+	mulpd		%xmm9, %xmm13
+	movapd 		80(%r11), %xmm9 // A[10]
+
+
+	// unroll 2
+	movddup		16(%r12), %xmm10 // B[2]
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	subl		$4, %r10d
+
+	movddup		48(%r12), %xmm15 // B[6]
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	movddup		80(%r12), %xmm14 // B[10]
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	movddup		112(%r12), %xmm12 // B[14]
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		96(%r11), %xmm8 // A[12]
+	mulpd		%xmm9, %xmm13
+	movapd 		112(%r11), %xmm9 // A[14]
+	
+
+	// unroll 3
+	movddup		24(%r12), %xmm10 // B[3]
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+
+	movddup		56(%r12), %xmm15 // B[7]
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addq		$128, %r11 // A += 16
+
+	movddup		88(%r12), %xmm14 // B[11]
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	movddup		120(%r12), %xmm12 // B[15]
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		0(%r11), %xmm8 // A[0]
+	mulpd		%xmm9, %xmm13
+	movapd 		16(%r11), %xmm9 // A[2]
+	addq		%r13, %r12 // B += ...
+
+
+	cmpl		$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+
+	// unroll 0
+	movddup		0(%r12), %xmm10 // B[0]
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+
+	movddup		32(%r12), %xmm15 // B[4]
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	movddup		64(%r12), %xmm14 // B[8]
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	movddup		96(%r12), %xmm12 // B[12]
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		32(%r11), %xmm8 // A[4]
+	mulpd		%xmm9, %xmm13
+	movapd 		48(%r11), %xmm9 // A[6]
+
+
+	// unroll 1
+	movddup		8(%r12), %xmm10 // B[1]
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+
+	movddup		40(%r12), %xmm15 // B[5]
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	movddup		72(%r12), %xmm14 // B[9]
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	movddup		104(%r12), %xmm12 // B[13]
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		64(%r11), %xmm8 // A[8]
+	mulpd		%xmm9, %xmm13
+	movapd 		80(%r11), %xmm9 // A[10]
+
+
+	// unroll 2
+	movddup		16(%r12), %xmm10 // B[2]
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	subl		$4, %r10d
+
+	movddup		48(%r12), %xmm15 // B[6]
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	movddup		80(%r12), %xmm14 // B[10]
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	movddup		112(%r12), %xmm12 // B[14]
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	movapd 		96(%r11), %xmm8 // A[12]
+	mulpd		%xmm9, %xmm13
+	movapd 		112(%r11), %xmm9 // A[14]
+	
+
+	// unroll 3
+	movddup		24(%r12), %xmm10 // B[3]
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+
+	movddup		56(%r12), %xmm15 // B[7]
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addq		$128, %r11 // A += 16
+
+	movddup		88(%r12), %xmm14 // B[11]
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+
+	movddup		120(%r12), %xmm12 // B[15]
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+//	movapd 		0(%r11), %xmm8 // A[0]
+	mulpd		%xmm9, %xmm13
+//	movapd 		16(%r11), %xmm9 // A[2]
+	addq		%r13, %r12 // B += ...
+
+
+	// clean accumulators
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+
+
+	jmp		2f
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+
+	// unroll 0
+	movapd 		0(%r11), %xmm8 // A[0]
+	movapd 		16(%r11), %xmm9 // A[2]
+
+	movddup		0(%r12), %xmm10 // B[0]
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	subl	$1, %r10d
+
+	movddup		32(%r12), %xmm15 // B[4]
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+
+	movddup		64(%r12), %xmm14 // B[8]
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+	addq	$32, %r11
+
+	movddup		96(%r12), %xmm12 // B[12]
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	mulpd		%xmm9, %xmm13
+	addq	$8, %r12
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+	// clean accumulators
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nn_4x4_lib4, .-inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+
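+// reference semantics (comment-only sketch): with B entering the panel at
+// offset offB, only kend = min(k, 4-offB) rows of the first panel are
+// consumed here, one per pass of the loop below, roughly
+//   for(ll=0; ll<kend; ll++)
+//     for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<4; ii++)
+//         D[ii+4*jj] += A[ii+4*ll] * B[offB+ll+4*jj];
+// afterwards B is advanced to the start of the next panel and the aligned
+// nn kernel above takes over.
+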
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dgemm_add_nn_4x4_lib4, @function
+inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+	
+	cmpl			$0, %r14d // offset==0
+	jle				2f // end
+
+	cmpl			$0, %r10d // k==0
+	jle				2f // end
+
+	movl			$4, %r15d
+	subl			%r14d, %r15d // 4-offsetB
+	cmpl			%r10d, %r15d
+//	jle				0f
+//	movl			%r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+	cmovgl			%r10d, %r15d // kend=min(k,4-offsetB)
+
+	movl			%r14d, %eax
+	sall			$3, %eax // offsetB*sizeof(double)
+	addq			%rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+	movapd 		0(%r11), %xmm8 // A[0]
+	movapd 		16(%r11), %xmm9 // A[2]
+
+	movddup		0(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	movddup		32(%r12), %xmm15 // B[4]
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+
+	movddup		64(%r12), %xmm14 // B[8]
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+
+	movddup		96(%r12), %xmm12 // B[12]
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+
+	subl			$1, %r10d // k-1
+	subl			$1, %r15d // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	cmpl			$0, %r15d
+	jg				1b
+
+	cmpl			$0, %r10d
+	jle				2f // end
+
+	addq			%r13, %r12
+	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dgemm_add_nn_4x4_lib4, .-inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10   <- k
+// r11   <- A
+// r12   <- B
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(double)
+// r12   <- B-offB+bs*sdb*sizeof(double)
+// r13   <- bs*sdb*sizeof(double)
+// r14   <- offB
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+
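+// reference semantics (comment-only sketch, offB==0 case): B is lower
+// triangular, so k-step ll of the 4x4 triangle touches only columns jj<=ll;
+// the unrolled steps below therefore accumulate 1, 2, 3 and 4 column updates,
+// roughly
+//   for(ll=0; ll<4; ll++)
+//     for(jj=0; jj<=ll; jj++)
+//       for(ii=0; ii<4; ii++)
+//         D[ii+4*jj] += A[ii+4*ll] * B[ll+4*jj];
+// the offB!=0 cases start from an unaligned row of the B panel and cross into
+// the next panel where needed.
+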
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nn_rl_4x4_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r14d
+	jg		0f
+
+	// offB==0
+
+	// unroll 0
+	movapd 		0(%r11), %xmm8 // A[0]
+	movapd 		16(%r11), %xmm9 // A[2]
+
+	movddup		0(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	// unroll 1
+	movapd 		32(%r11), %xmm8 // A[0]
+	movapd 		48(%r11), %xmm9 // A[2]
+
+	movddup		8(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	movddup		40(%r12), %xmm15 // B[4]
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+
+	// unroll 2
+	movapd 		64(%r11), %xmm8 // A[0]
+	movapd 		80(%r11), %xmm9 // A[2]
+
+	movddup		16(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	movddup		48(%r12), %xmm15 // B[4]
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+
+	movddup		80(%r12), %xmm14 // B[8]
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+
+	// unroll 3
+	movapd 		96(%r11), %xmm8 // A[0]
+	movapd 		112(%r11), %xmm9 // A[2]
+
+	movddup		24(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	movddup		56(%r12), %xmm15 // B[4]
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+
+	movddup		88(%r12), %xmm14 // B[8]
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+
+	movddup		120(%r12), %xmm12 // B[12]
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+
+	subl	$4, %r10d // k-4
+	addq	$128, %r11 // A+4*bs*sizeof(double)
+	addq	%r13, %r12 // B+bs*sdb*sizeof(double)
+
+	jmp		3f
+
+0:
+	cmpl	$1, %r14d
+	jg		1f
+
+	// offB==1
+
+	addq			$8, %r12 // B+1*sizeof(double)
+
+	// unroll 0
+	movapd 		0(%r11), %xmm8 // A[0]
+	movapd 		16(%r11), %xmm9 // A[2]
+
+	movddup		0(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	// unroll 1
+	movapd 		32(%r11), %xmm8 // A[0]
+	movapd 		48(%r11), %xmm9 // A[2]
+
+	movddup		8(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	movddup		40(%r12), %xmm15 // B[4]
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+
+	// unroll 2
+	movapd 		64(%r11), %xmm8 // A[0]
+	movapd 		80(%r11), %xmm9 // A[2]
+
+	movddup		16(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	movddup		48(%r12), %xmm15 // B[4]
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+
+	movddup		80(%r12), %xmm14 // B[8]
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+
+	subl	$3, %r10d // k-3
+	addq	$96, %r11 // A+3*bs*sizeof(double)
+	addq	%r13, %r12
+	subq			$8, %r12 // B+bs*sdb*sizeof(double)-1
+
+	jmp		3f
+
+1:
+	cmpl	$2, %r14d
+	jg		2f
+
+	// offB==2
+
+	addq	$16, %r12 // B+2*sizeof(double)
+
+	// unroll 0
+	movapd 		0(%r11), %xmm8 // A[0]
+	movapd 		16(%r11), %xmm9 // A[2]
+
+	movddup		0(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	// unroll 1
+	movapd 		32(%r11), %xmm8 // A[0]
+	movapd 		48(%r11), %xmm9 // A[2]
+
+	movddup		8(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	movddup		40(%r12), %xmm15 // B[4]
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+
+	subl	$2, %r10d // k-2
+	addq	$64, %r11 // A+2*bs*sizeof(double)
+	addq	%r13, %r12
+	subq	$16, %r12 // B+bs*sdb*sizeof(double)-2
+
+	// unroll 2
+	movapd 		0(%r11), %xmm8 // A[0]
+	movapd 		16(%r11), %xmm9 // A[2]
+
+	movddup		0(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	movddup		32(%r12), %xmm15 // B[4]
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+
+	movddup		64(%r12), %xmm14 // B[8]
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+
+	// unroll 3
+	movapd 		32(%r11), %xmm8 // A[0]
+	movapd 		48(%r11), %xmm9 // A[2]
+
+	movddup		8(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	movddup		40(%r12), %xmm15 // B[4]
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+
+	movddup		72(%r12), %xmm14 // B[8]
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+
+	movddup		104(%r12), %xmm12 // B[12]
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+
+	// unroll 4
+	movapd 		64(%r11), %xmm8 // A[0]
+	movapd 		80(%r11), %xmm9 // A[2]
+
+	movddup		16(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	movddup		48(%r12), %xmm15 // B[4]
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+
+	movddup		80(%r12), %xmm14 // B[8]
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+
+	movddup		112(%r12), %xmm12 // B[12]
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+
+	// unroll 5
+	movapd 		96(%r11), %xmm8 // A[0]
+	movapd 		112(%r11), %xmm9 // A[2]
+
+	movddup		24(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	movddup		56(%r12), %xmm15 // B[4]
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+
+	movddup		88(%r12), %xmm14 // B[8]
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+
+	movddup		120(%r12), %xmm12 // B[12]
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+
+	subl	$4, %r10d // k-4
+	addq	$128, %r11 // A+4*bs*sizeof(double)
+	addq	%r13, %r12 // B+bs*sdb*sizeof(double)
+
+	jmp		3f
+
+2:
+	// offB==3
+
+	addq	$24, %r12 // B+3*sizeof(double)
+
+	// unroll 0
+	movapd 		0(%r11), %xmm8 // A[0]
+	movapd 		16(%r11), %xmm9 // A[2]
+
+	movddup		0(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	subl	$1, %r10d // k-1
+	addq	$32, %r11 // A+1*bs*sizeof(double)
+	addq	%r13, %r12
+	subq	$24, %r12 // B+bs*sdb*sizeof(double)-3
+
+	// unroll 1
+	movapd 		0(%r11), %xmm8 // A[0]
+	movapd 		16(%r11), %xmm9 // A[2]
+
+	movddup		0(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	movddup		32(%r12), %xmm15 // B[4]
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+
+	// unroll 2
+	movapd 		32(%r11), %xmm8 // A[0]
+	movapd 		48(%r11), %xmm9 // A[2]
+
+	movddup		8(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	movddup		40(%r12), %xmm15 // B[4]
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+
+	movddup		72(%r12), %xmm14 // B[8]
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+
+	// unroll 3
+	movapd 		64(%r11), %xmm8 // A[0]
+	movapd 		80(%r11), %xmm9 // A[2]
+
+	movddup		16(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	movddup		48(%r12), %xmm15 // B[4]
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+
+	movddup		80(%r12), %xmm14 // B[8]
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+
+	movddup		112(%r12), %xmm12 // B[12]
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+
+	// unroll 4
+	movapd 		96(%r11), %xmm8 // A[0]
+	movapd 		112(%r11), %xmm9 // A[2]
+
+	movddup		24(%r12), %xmm10 // B[0]
+	movapd		%xmm10, %xmm11
+	mulpd		%xmm8, %xmm10
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm10, %xmm0
+	addpd		%xmm11, %xmm4
+
+	movddup		56(%r12), %xmm15 // B[4]
+	movapd		%xmm15, %xmm13
+	mulpd		%xmm8, %xmm15
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm15, %xmm1
+	addpd		%xmm13, %xmm5
+
+	movddup		88(%r12), %xmm14 // B[8]
+	movapd		%xmm14, %xmm11
+	mulpd		%xmm8, %xmm14
+	mulpd		%xmm9, %xmm11
+	addpd		%xmm14, %xmm2
+	addpd		%xmm11, %xmm6
+
+	movddup		120(%r12), %xmm12 // B[12]
+	movapd		%xmm12, %xmm13
+	mulpd		%xmm8, %xmm12
+	mulpd		%xmm9, %xmm13
+	addpd		%xmm12, %xmm3
+	addpd		%xmm13, %xmm7
+
+	subl	$4, %r10d // k-4
+	addq	$128, %r11 // A+4*bs*sizeof(double)
+	addq	%r13, %r12 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nn_rl_4x4_lib4, .-inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10   <- A
+// r11   <- B
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10   <- A+4*4*sizeof(double)
+// r11   <- B+4*4*sizeof(double)
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+
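+// reference semantics (comment-only sketch): with B upper triangular and
+// accessed transposed (nt), k-step ll of the 4x4 triangle touches only
+// columns jj<=ll, so the four steps accumulate 1, 2, 3 and 4 column updates:
+//   for(ll=0; ll<4; ll++)
+//     for(jj=0; jj<=ll; jj++)
+//       for(ii=0; ii<4; ii++)
+//         D[ii+4*jj] += A[ii+4*ll] * B[jj+4*ll];
+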
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_4x4_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#endif
+#endif
+	
+	movapd			0(%r10), %xmm8
+	movapd			16(%r10), %xmm9
+	movddup			0(%r11), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm0
+	addpd			%xmm13, %xmm4
+
+	movapd			32(%r10), %xmm8
+	movapd			48(%r10), %xmm9
+	movddup			32(%r11), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm0
+	addpd			%xmm13, %xmm4
+	movddup			40(%r11), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm1
+	addpd			%xmm13, %xmm5
+
+	movapd			64(%r10), %xmm8
+	movapd			80(%r10), %xmm9
+	movddup			64(%r11), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm0
+	addpd			%xmm13, %xmm4
+	movddup			72(%r11), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm1
+	addpd			%xmm13, %xmm5
+	movddup			80(%r11), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm2
+	addpd			%xmm13, %xmm6
+
+	movapd			96(%r10), %xmm8
+	movapd			112(%r10), %xmm9
+	movddup			96(%r11), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm0
+	addpd			%xmm13, %xmm4
+	movddup			104(%r11), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm1
+	addpd			%xmm13, %xmm5
+	movddup			112(%r11), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm2
+	addpd			%xmm13, %xmm6
+	movddup			120(%r11), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm3
+	addpd			%xmm13, %xmm7
+
+	addq			$128, %r10
+	addq			$128, %r11
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_4x4_lib4, .-inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d  <- k
+// r11   <- A
+// r12   <- B
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- max(k-4,0)
+// r11   <- A+4*4*sizeof(double)
+// r12   <- B+4*4*sizeof(double)
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+
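+// reference semantics (comment-only sketch): same triangle as the routine
+// above, but k may be smaller than 4, so the counter in r10d is re-checked
+// after every step and the remaining steps are skipped once it reaches zero.
+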
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrmm_nt_ru_4x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+#endif
+	
+	movapd			0(%r11), %xmm8
+	movapd			16(%r11), %xmm9
+	subl			$1, %r10d
+	movddup			0(%r12), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm0
+	addpd			%xmm13, %xmm4
+	addq			$32, %r11
+	addq			$32, %r12
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	movapd			0(%r11), %xmm8
+	movapd			16(%r11), %xmm9
+	subl			$1, %r10d
+	movddup			0(%r12), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm0
+	addpd			%xmm13, %xmm4
+	addq			$32, %r11
+	movddup			8(%r12), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm1
+	addpd			%xmm13, %xmm5
+	addq			$32, %r12
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	movapd			0(%r11), %xmm8
+	movapd			16(%r11), %xmm9
+	subl			$1, %r10d
+	movddup			0(%r12), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm0
+	addpd			%xmm13, %xmm4
+	movddup			8(%r12), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm1
+	addpd			%xmm13, %xmm5
+	addq			$32, %r11
+	movddup			16(%r12), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm2
+	addpd			%xmm13, %xmm6
+	addq			$32, %r12
+
+	cmpl	$0, %r10d
+	jle		0f
+
+	movapd			0(%r11), %xmm8
+	movapd			16(%r11), %xmm9
+	subl			$1, %r10d
+	movddup			0(%r12), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm0
+	addpd			%xmm13, %xmm4
+	movddup			8(%r12), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm1
+	addpd			%xmm13, %xmm5
+	movddup			16(%r12), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm2
+	addpd			%xmm13, %xmm6
+	addq			$32, %r11
+	movddup			24(%r12), %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm8, %xmm12
+	mulpd			%xmm9, %xmm13
+	addpd			%xmm12, %xmm3
+	addpd			%xmm13, %xmm7
+	addq			$32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrmm_nt_ru_4x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// xmm0  <- [d01 d10]
+// xmm1  <- [d00 d11]
+// xmm2  <- [d03 d12]
+// xmm3  <- [d02 d13]
+// xmm4  <- [d21 d30]
+// xmm5  <- [d20 d31]
+// xmm6  <- [d23 d32]
+// xmm7  <- [d22 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
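+// reference semantics (comment-only sketch): each movsd below copies the low
+// double of its source into the low double of its destination, so e.g. the
+// pair (xmm0, xmm1) = ([d01 d10], [d00 d11]) becomes ([d00 d10], [d01 d11]);
+// applied to all four pairs this undoes the pshufd interleaving of the nt
+// kernel and leaves the block in plain column order.
+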
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_4x4_lib4, @function
+inner_blend_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_4x4_lib4:
+#endif
+#endif
+	
+	movapd	%xmm0, %xmm8
+	movsd	%xmm1, %xmm0
+	movsd	%xmm8, %xmm1
+
+	movapd	%xmm2, %xmm8
+	movsd	%xmm3, %xmm2
+	movsd	%xmm8, %xmm3
+
+	movapd	%xmm4, %xmm8
+	movsd	%xmm5, %xmm4
+	movsd	%xmm8, %xmm5
+
+	movapd	%xmm6, %xmm8
+	movsd	%xmm7, %xmm6
+	movsd	%xmm8, %xmm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_4x4_lib4, .-inner_blend_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
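+// reference semantics (comment-only sketch):
+//   for(jj=0; jj<4; jj++)
+//     for(ii=0; ii<4; ii++)
+//       D[ii+4*jj] = alpha*D[ii+4*jj] + beta*C[ii+4*jj];
+// with alpha at 0(%r10), beta at 0(%r11) and C a 4x4 block in lib4 panel
+// storage at %r12.
+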
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_AB_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_ab_4x4_lib4, @function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_lib4:
+#endif
+#endif
+	
+	// alpha
+	movddup	0(%r10), %xmm15
+
+	mulpd	%xmm15, %xmm0
+	mulpd	%xmm15, %xmm1
+	mulpd	%xmm15, %xmm2
+	mulpd	%xmm15, %xmm3
+	mulpd	%xmm15, %xmm4
+	mulpd	%xmm15, %xmm5
+	mulpd	%xmm15, %xmm6
+	mulpd	%xmm15, %xmm7
+
+
+	// beta
+	movddup	0(%r11), %xmm14
+
+	movapd		0(%r12), %xmm15
+	mulpd		%xmm14, %xmm15
+	addpd		%xmm15, %xmm0
+	movapd		16(%r12), %xmm15
+	mulpd		%xmm14, %xmm15
+	addpd		%xmm15, %xmm4
+	movapd		32(%r12), %xmm15
+	mulpd		%xmm14, %xmm15
+	addpd		%xmm15, %xmm1
+	movapd		48(%r12), %xmm15
+	mulpd		%xmm14, %xmm15
+	addpd		%xmm15, %xmm5
+	movapd		64(%r12), %xmm15
+	mulpd		%xmm14, %xmm15
+	addpd		%xmm15, %xmm2
+	movapd		80(%r12), %xmm15
+	mulpd		%xmm14, %xmm15
+	addpd		%xmm15, %xmm6
+	movapd		96(%r12), %xmm15
+	mulpd		%xmm14, %xmm15
+	addpd		%xmm15, %xmm3
+	movapd		112(%r12), %xmm15
+	mulpd		%xmm14, %xmm15
+	addpd		%xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0.0
+//
+// input arguments:
+// r10   <- alpha
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10   <- alpha
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
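+// reference semantics (comment-only sketch): beta==0.0 specialization of the
+// routine above, i.e. D[ii+4*jj] = alpha*D[ii+4*jj] with no access to C.
+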
+#if MACRO_LEVEL>=1
+	.macro INNER_SCALE_A0_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_scale_a0_4x4_lib4, @function
+inner_scale_a0_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_scale_a0_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_4x4_lib4:
+#endif
+#endif
+	
+	// alpha
+	movddup	0(%r10), %xmm15
+
+	mulpd	%xmm15, %xmm0
+	mulpd	%xmm15, %xmm1
+	mulpd	%xmm15, %xmm2
+	mulpd	%xmm15, %xmm3
+	mulpd	%xmm15, %xmm4
+	mulpd	%xmm15, %xmm5
+	mulpd	%xmm15, %xmm6
+	mulpd	%xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_scale_a0_4x4_lib4, .-inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// xmm0  <- [d01 d10]
+// xmm1  <- [d00 d11]
+// xmm2  <- [d03 d12]
+// xmm3  <- [d02 d13]
+// xmm4  <- [d21 d30]
+// xmm5  <- [d20 d31]
+// xmm6  <- [d23 d32]
+// xmm7  <- [d22 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
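+// reference semantics (comment-only sketch): fusion of inner_blend_4x4_lib4
+// and inner_scale_ab_4x4_lib4 above: the movsd swaps first restore column
+// order, then D = alpha*D + beta*C is applied to the 4x4 block.
+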
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+	
+	movapd	%xmm0, %xmm8
+	movsd	%xmm1, %xmm0
+	movsd	%xmm8, %xmm1
+
+	movapd	%xmm2, %xmm8
+	movsd	%xmm3, %xmm2
+	movsd	%xmm8, %xmm3
+
+	movapd	%xmm4, %xmm8
+	movsd	%xmm5, %xmm4
+	movsd	%xmm8, %xmm5
+
+	movapd	%xmm6, %xmm8
+	movsd	%xmm7, %xmm6
+	movsd	%xmm8, %xmm7
+
+	// alpha
+	movddup	0(%r10), %xmm15
+
+	mulpd	%xmm15, %xmm0
+	mulpd	%xmm15, %xmm1
+	mulpd	%xmm15, %xmm2
+	mulpd	%xmm15, %xmm3
+	mulpd	%xmm15, %xmm4
+	mulpd	%xmm15, %xmm5
+	mulpd	%xmm15, %xmm6
+	mulpd	%xmm15, %xmm7
+
+
+	// beta
+	movddup	0(%r11), %xmm14
+
+	movapd		0(%r12), %xmm15
+	mulpd		%xmm14, %xmm15
+	addpd		%xmm15, %xmm0
+	movapd		16(%r12), %xmm15
+	mulpd		%xmm14, %xmm15
+	addpd		%xmm15, %xmm4
+	movapd		32(%r12), %xmm15
+	mulpd		%xmm14, %xmm15
+	addpd		%xmm15, %xmm1
+	movapd		48(%r12), %xmm15
+	mulpd		%xmm14, %xmm15
+	addpd		%xmm15, %xmm5
+	movapd		64(%r12), %xmm15
+	mulpd		%xmm14, %xmm15
+	addpd		%xmm15, %xmm2
+	movapd		80(%r12), %xmm15
+	mulpd		%xmm14, %xmm15
+	addpd		%xmm15, %xmm6
+	movapd		96(%r12), %xmm15
+	mulpd		%xmm14, %xmm15
+	addpd		%xmm15, %xmm3
+	movapd		112(%r12), %xmm15
+	mulpd		%xmm14, %xmm15
+	addpd		%xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10   <- C
+// xmm0  <- [d01 d10]
+// xmm1  <- [d00 d11]
+// xmm2  <- [d03 d12]
+// xmm3  <- [d02 d13]
+// xmm4  <- [d21 d30]
+// xmm5  <- [d20 d31]
+// xmm6  <- [d23 d32]
+// xmm7  <- [d22 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10   <- C
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
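+// reference semantics (comment-only sketch): blend followed by the fixed
+// scaling alpha=1.0, beta=1.0, i.e. D[ii+4*jj] += C[ii+4*jj]; only the C
+// pointer in r10 is needed.
+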
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_11_4x4_lib4, @function
+inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_4x4_lib4:
+#endif
+#endif	
+	
+	movapd	%xmm0, %xmm8
+	movsd	%xmm1, %xmm0
+	movsd	%xmm8, %xmm1
+
+	movapd	%xmm2, %xmm8
+	movsd	%xmm3, %xmm2
+	movsd	%xmm8, %xmm3
+
+	movapd	%xmm4, %xmm8
+	movsd	%xmm5, %xmm4
+	movsd	%xmm8, %xmm5
+
+	movapd	%xmm6, %xmm8
+	movsd	%xmm7, %xmm6
+	movsd	%xmm8, %xmm7
+
+
+	movapd		0(%r10), %xmm15
+	addpd		%xmm15, %xmm0
+	movapd		16(%r10), %xmm15
+	addpd		%xmm15, %xmm4
+	movapd		32(%r10), %xmm15
+	addpd		%xmm15, %xmm1
+	movapd		48(%r10), %xmm15
+	addpd		%xmm15, %xmm5
+	movapd		64(%r10), %xmm15
+	addpd		%xmm15, %xmm2
+	movapd		80(%r10), %xmm15
+	addpd		%xmm15, %xmm6
+	movapd		96(%r10), %xmm15
+	addpd		%xmm15, %xmm3
+	movapd		112(%r10), %xmm15
+	addpd		%xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_4x4_lib4, .-inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization 
+//
+// input arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10  <- inv_diag_E
+// r11d <- kn
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
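+// reference semantics (comment-only sketch): unblocked lower Cholesky of the
+// 4x4 block held in the accumulators, computing the first min(kn,4) columns;
+// per column jj, roughly
+//   for(ll=0; ll<jj; ll++)                      // downdate with previous columns
+//     for(ii=jj; ii<4; ii++)
+//       D[ii+4*jj] -= D[ii+4*ll] * D[jj+4*ll];
+//   if(D[jj+4*jj]>0.0) inv_diag_E[jj] = 1.0/sqrt(D[jj+4*jj]);
+//   else               inv_diag_E[jj] = 0.0;
+//   for(ii=jj; ii<4; ii++)
+//     D[ii+4*jj] *= inv_diag_E[jj];
+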
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dpotrf_4x4_vs_lib4, @function
+inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dpotrf_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_vs_lib4:
+#endif
+#endif
+	
+	xorpd			%xmm15, %xmm15 // 0.0
+
+	movsd			%xmm0, %xmm13
+	ucomisd			%xmm15, %xmm13 // d_00 > 0.0 ?
+	jbe				1f
+	sqrtsd			%xmm13, %xmm13
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	movsd			.LC04(%rip), %xmm12 // 1.0
+#elif defined(OS_MAC)
+	movsd			LC04(%rip), %xmm12 // 1.0
+#endif
+	divsd			%xmm13, %xmm12
+2:
+	cmpl			$2, %r11d
+	movsd			%xmm12, 0(%r10)
+	movddup			%xmm12, %xmm12
+	mulpd			%xmm12, %xmm0
+	mulpd			%xmm12, %xmm4
+
+	jl				0f // ret
+
+	movapd			%xmm0, %xmm12
+	shufpd			$0x3, %xmm12, %xmm12
+	movapd			%xmm12, %xmm13
+	mulpd			%xmm0, %xmm12
+	mulpd			%xmm4, %xmm13
+	subpd			%xmm12, %xmm1
+	subpd			%xmm13, %xmm5
+	movapd			%xmm1, %xmm13
+	shufpd			$0x3, %xmm13, %xmm13 // broadcast d_11 (high lane) into both lanes; only the low lane is used below, so $0x1 would work as well
+	ucomisd			%xmm15, %xmm13 // d_11 > 0.0 ?
+	jbe				3f
+	sqrtsd			%xmm13, %xmm13
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	movsd			.LC04(%rip), %xmm12 // 1.0
+#elif defined(OS_MAC)
+	movsd			LC04(%rip), %xmm12 // 1.0
+#endif
+	divsd			%xmm13, %xmm12
+4:
+	cmpl			$3, %r11d
+	movsd			%xmm12, 8(%r10)
+	movddup			%xmm12, %xmm12
+	mulpd			%xmm12, %xmm1
+	mulpd			%xmm12, %xmm5
+
+	jl				0f // ret
+
+	movddup			%xmm4, %xmm12
+	movddup			%xmm5, %xmm13
+	mulpd			%xmm4, %xmm12
+	mulpd			%xmm5, %xmm13
+	subpd			%xmm12, %xmm6
+	subpd			%xmm13, %xmm6
+	movsd			%xmm6, %xmm13
+	ucomisd			%xmm15, %xmm13 // d_22 > 0.0 ?
+	jbe				5f
+	sqrtsd			%xmm13, %xmm13
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	movsd			.LC04(%rip), %xmm12 // 1.0
+#elif defined(OS_MAC)
+	movsd			LC04(%rip), %xmm12 // 1.0
+#endif
+	divsd			%xmm13, %xmm12
+6:
+	cmpl			$4, %r11d
+	movsd			%xmm12, 16(%r10)
+	movddup			%xmm12, %xmm12
+	mulpd			%xmm12, %xmm6
+
+	jl				0f // ret
+
+	movapd			%xmm4, %xmm12
+	movapd			%xmm5, %xmm13
+	movapd			%xmm6, %xmm14
+	shufpd			$0x3, %xmm12, %xmm12
+	shufpd			$0x3, %xmm13, %xmm13
+	shufpd			$0x3, %xmm14, %xmm14
+	mulpd			%xmm4, %xmm12
+	mulpd			%xmm5, %xmm13
+	mulpd			%xmm6, %xmm14
+	subpd			%xmm12, %xmm7
+	subpd			%xmm13, %xmm7
+	subpd			%xmm14, %xmm7
+	movapd			%xmm7, %xmm13
+	shufpd			$0x3, %xmm13, %xmm13
+	ucomisd			%xmm15, %xmm13 // d_33 > 0.0 ?
+	jbe				7f
+	sqrtsd			%xmm13, %xmm13
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	movsd			.LC04(%rip), %xmm12 // 1.0
+#elif defined(OS_MAC)
+	movsd			LC04(%rip), %xmm12 // 1.0
+#endif
+	divsd			%xmm13, %xmm12
+8:
+	movsd			%xmm12, 24(%r10)
+	movddup			%xmm12, %xmm12
+	mulpd			%xmm12, %xmm7
+
+	jmp		0f
+	
+1:
+	xorpd	%xmm12, %xmm12
+	jmp		2b
+
+3:
+	xorpd	%xmm12, %xmm12
+	jmp		4b
+
+5:
+	xorpd	%xmm12, %xmm12
+	jmp		6b
+
+7:
+	xorpd	%xmm12, %xmm12
+	jmp		8b
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dpotrf_4x4_vs_lib4, .-inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization 
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
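+// For reference only (not assembled): a scalar C sketch of the operation, with the
+// same illustrative D[i+4*j] layout as above. E is the 4x4 lower triangular factor
+// and inv_diag_E holds the reciprocals of its diagonal, so the routine computes
+// D <- D * E^{-T} without any divisions.
+//
+//   static void ref_trsm_rlt_inv_4x4(double *D, const double *E,
+//                                    const double *inv_diag_E)
+//   {
+//       for(int j=0; j<4; j++)
+//           for(int i=0; i<4; i++) {
+//               double t = D[i+4*j];
+//               for(int k=0; k<j; k++) t -= D[i+4*k]*E[j+4*k]; // E[j+4*k] = E(j,k), k<j
+//               D[i+4*j] = t*inv_diag_E[j];
+//           }
+//   }
+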
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_4x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+	
+	movddup			0(%r11), %xmm13
+	mulpd			%xmm13, %xmm0
+	mulpd			%xmm13, %xmm4
+
+	movddup			8(%r10), %xmm13
+	movapd			%xmm13, %xmm12
+	mulpd			%xmm0, %xmm13
+	mulpd			%xmm4, %xmm12
+	subpd			%xmm13, %xmm1
+	subpd			%xmm12, %xmm5
+	movddup			8(%r11), %xmm13
+	mulpd			%xmm13, %xmm1
+	mulpd			%xmm13, %xmm5
+
+	movddup			16(%r10), %xmm13
+	movapd			%xmm13, %xmm12
+	mulpd			%xmm0, %xmm12
+	mulpd			%xmm4, %xmm13
+	subpd			%xmm12, %xmm2
+	subpd			%xmm13, %xmm6
+	movddup			48(%r10), %xmm13
+	movapd			%xmm13, %xmm12
+	mulpd			%xmm1, %xmm12
+	mulpd			%xmm5, %xmm13
+	subpd			%xmm12, %xmm2
+	subpd			%xmm13, %xmm6
+	movddup			16(%r11), %xmm13
+	mulpd			%xmm13, %xmm2
+	mulpd			%xmm13, %xmm6
+
+	movddup			24(%r10), %xmm13
+	movapd			%xmm13, %xmm12
+	mulpd			%xmm0, %xmm12
+	mulpd			%xmm4, %xmm13
+	subpd			%xmm12, %xmm3
+	subpd			%xmm13, %xmm7
+	movddup			56(%r10), %xmm13
+	movapd			%xmm13, %xmm12
+	mulpd			%xmm1, %xmm12
+	mulpd			%xmm5, %xmm13
+	subpd			%xmm12, %xmm3
+	subpd			%xmm13, %xmm7
+	movddup			88(%r10), %xmm13
+	movapd			%xmm13, %xmm12
+	mulpd			%xmm2, %xmm12
+	mulpd			%xmm6, %xmm13
+	subpd			%xmm12, %xmm3
+	subpd			%xmm13, %xmm7
+	movddup			24(%r11), %xmm13
+	mulpd			%xmm13, %xmm3
+	mulpd			%xmm13, %xmm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization 
+//
+// input arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// r11  <- inv_diag_D
+// r12d <- kn
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_edge_dtrsm_rlt_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#endif
+#endif
+	
+	movddup			0(%r11), %xmm13
+	cmpl			$2, %r12d
+	mulpd			%xmm13, %xmm0
+	mulpd			%xmm13, %xmm4
+
+	jl				0f // ret
+
+	movddup			8(%r10), %xmm13
+	cmpl			$3, %r12d
+	movapd			%xmm13, %xmm12
+	mulpd			%xmm0, %xmm13
+	mulpd			%xmm4, %xmm12
+	subpd			%xmm13, %xmm1
+	subpd			%xmm12, %xmm5
+	movddup			8(%r11), %xmm13
+	mulpd			%xmm13, %xmm1
+	mulpd			%xmm13, %xmm5
+
+	jl				0f // ret
+
+	movddup			16(%r10), %xmm13
+	cmpl			$4, %r12d
+	movapd			%xmm13, %xmm12
+	mulpd			%xmm0, %xmm12
+	mulpd			%xmm4, %xmm13
+	subpd			%xmm12, %xmm2
+	subpd			%xmm13, %xmm6
+	movddup			48(%r10), %xmm13
+	movapd			%xmm13, %xmm12
+	mulpd			%xmm1, %xmm12
+	mulpd			%xmm5, %xmm13
+	subpd			%xmm12, %xmm2
+	subpd			%xmm13, %xmm6
+	movddup			16(%r11), %xmm13
+	mulpd			%xmm13, %xmm2
+	mulpd			%xmm13, %xmm6
+
+	jl				0f // ret
+
+	movddup			24(%r10), %xmm13
+	movapd			%xmm13, %xmm12
+	mulpd			%xmm0, %xmm12
+	mulpd			%xmm4, %xmm13
+	subpd			%xmm12, %xmm3
+	subpd			%xmm13, %xmm7
+	movddup			56(%r10), %xmm13
+	movapd			%xmm13, %xmm12
+	mulpd			%xmm1, %xmm12
+	mulpd			%xmm5, %xmm13
+	subpd			%xmm12, %xmm3
+	subpd			%xmm13, %xmm7
+	movddup			88(%r10), %xmm13
+	movapd			%xmm13, %xmm12
+	mulpd			%xmm2, %xmm12
+	mulpd			%xmm6, %xmm13
+	subpd			%xmm12, %xmm3
+	subpd			%xmm13, %xmm7
+	movddup			24(%r11), %xmm13
+	mulpd			%xmm13, %xmm3
+	mulpd			%xmm13, %xmm7
+
+0:
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10  <- D
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
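+// For reference only (not assembled): the 4x4 block is written in the panel-major
+// (lib4) layout, element (i,j) at byte offset 8*(i+4*j) from D, which is why each
+// column is split across a register pair (rows 0-1 / rows 2-3) 32 bytes apart.
+// A scalar C equivalent of the store, with d[i][j] standing for the register
+// contents listed above (illustrative names only):
+//
+//   static void ref_store_4x4_lib4(double *D, const double d[4][4])
+//   {
+//       for(int j=0; j<4; j++)
+//           for(int i=0; i<4; i++)
+//               D[i+4*j] = d[i][j];
+//   }
+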
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+	
+	movapd %xmm0,   0(%r10)
+	movapd %xmm4,  16(%r10)
+	movapd %xmm1,  32(%r10)
+	movapd %xmm5,  48(%r10)
+	movapd %xmm2,  64(%r10)
+	movapd %xmm6,  80(%r10)
+	movapd %xmm3,  96(%r10)
+	movapd %xmm7, 112(%r10)
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_vs_lib4, @function
+inner_store_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl		$2, %r11d
+	jg			1f
+	je			0f
+
+	// km==1
+	movsd		%xmm0,  0(%r10)
+	cmpl		$2, %r12d
+	jl			4f // end
+	movsd		%xmm1, 32(%r10)
+	cmpl		$3, %r12d
+	jl			4f // end
+	movsd		%xmm2, 64(%r10)
+	je			4f // end
+	movsd		%xmm3, 96(%r10)
+
+	jmp		4f
+
+0:
+	// km==2
+	movapd		%xmm0,  0(%r10)
+	cmpl		$2, %r12d
+	jl			4f // end
+	movapd		%xmm1, 32(%r10)
+	cmpl		$3, %r12d
+	jl			4f // end
+	movapd		%xmm2, 64(%r10)
+	je			4f // end
+	movapd		%xmm3, 96(%r10)
+
+	jmp		4f
+
+1:
+	cmpl		$3, %r11d
+	jg			2f
+
+	// km==3
+	movapd		%xmm0,   0(%r10)
+	movsd		%xmm4,  16(%r10)
+	cmpl		$2, %r12d
+	jl			4f // end
+	movapd		%xmm1,  32(%r10)
+	movsd		%xmm5,  48(%r10)
+	cmpl		$3, %r12d
+	jl			4f // end
+	movapd		%xmm2,  64(%r10)
+	movsd		%xmm6,  80(%r10)
+	je			4f // end
+	movapd		%xmm3,  96(%r10)
+	movsd		%xmm7, 112(%r10)
+
+	jmp		4f
+
+2:
+	// km==4
+	movapd		%xmm0,   0(%r10)
+	movapd		%xmm4,  16(%r10)
+	cmpl		$2, %r12d
+	jl			4f // end
+	movapd		%xmm1,  32(%r10)
+	movapd		%xmm5,  48(%r10)
+	cmpl		$3, %r12d
+	jl			4f // end
+	movapd		%xmm2,  64(%r10)
+	movapd		%xmm6,  80(%r10)
+	je			4f // end
+	movapd		%xmm3,  96(%r10)
+	movapd		%xmm7, 112(%r10)
+
+4:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_vs_lib4, .-inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n0 // col index: start from (inc)
+// rax  <- n1 // col index: up to (exc)
+// rbx  <- dirty
+// xmm0 <-
+//
+// output arguments:
+// r10  <- offset
+// r11  <- D
+// r12  <- 4*sdd*sizeof(double)
+// r13  <- m0 // row index: start from (inc)
+// r14  <- m1 // row index: up to (exc)
+// r15  <- n1-n0
+// rax  <- n1-n0
+// rbx  <- dirty
+// xmm0 <-
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X4_GEN_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_gen_lib4, @function
+inner_store_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_gen_lib4:
+#endif
+#endif
+	
+	// no mask computation here: partial rows/columns are handled by the branches below
+
+	// shift D and sol for cols
+	cmpl	$0, %r15d
+	jle		0f
+
+	movapd		%xmm1, %xmm0
+	movapd		%xmm5, %xmm4
+	movapd		%xmm2, %xmm1
+	movapd		%xmm6, %xmm5
+	movapd		%xmm3, %xmm2
+	movapd		%xmm7, %xmm6
+	addq		$32, %r11
+
+	cmpl	$1, %r15d
+	jle		0f
+
+	movapd		%xmm1, %xmm0
+	movapd		%xmm5, %xmm4
+	movapd		%xmm2, %xmm1
+	movapd		%xmm6, %xmm5
+	addq		$32, %r11
+
+	cmpl	$2, %r15d
+	jle		0f
+
+	movapd		%xmm1, %xmm0
+	movapd		%xmm5, %xmm4
+	addq		$32, %r11
+
+0:
+
+	// compute number of cols
+	cmpl	$4, %eax
+	jle		0f
+	movl	$4, %eax
+0:
+	subl	%r15d, %eax
+	movl	%eax, %r15d
+
+
+	cmpl	$0, %r10d
+	jg		0f
+
+	///////////////
+	// offset==0 //
+	///////////////
+
+	cmpl	$0, %r13d
+	jle		4f
+
+	cmpl	$1, %r13d
+	jg		5f
+
+	movsd	0(%r11), %xmm8
+	movsd	%xmm8, %xmm0
+	movsd	32(%r11), %xmm8
+	movsd	%xmm8, %xmm1
+	movsd	64(%r11), %xmm8
+	movsd	%xmm8, %xmm2
+	movsd	96(%r11), %xmm8
+	movsd	%xmm8, %xmm3
+
+	jmp		4f
+
+5:
+
+	cmpl	$2, %r13d
+	jg		5f
+
+	movapd	0(%r11), %xmm0
+	movapd	32(%r11), %xmm1
+	movapd	64(%r11), %xmm2
+	movapd	96(%r11), %xmm3
+
+	jmp		4f
+
+5:
+
+	cmpl	$3, %r13d
+	jg		5f
+
+	movapd	0(%r11), %xmm0
+	movsd	16(%r11), %xmm8
+	movsd	%xmm8, %xmm4
+	movapd	32(%r11), %xmm1
+	movsd	48(%r11), %xmm8
+	movsd	%xmm8, %xmm5
+	movapd	64(%r11), %xmm2
+	movsd	80(%r11), %xmm8
+	movsd	%xmm8, %xmm6
+	movapd	96(%r11), %xmm3
+	movsd	112(%r11), %xmm8
+	movsd	%xmm8, %xmm7
+
+	jmp		4f
+
+5:
+
+	movapd	0(%r11), %xmm0
+	movapd	16(%r11), %xmm4
+	movapd	32(%r11), %xmm1
+	movapd	48(%r11), %xmm5
+	movapd	64(%r11), %xmm2
+	movapd	80(%r11), %xmm6
+	movapd	96(%r11), %xmm3
+	movapd	112(%r11), %xmm7
+
+4:
+	cmpl		$2, %r14d
+	jg			5f
+	je			4f
+
+	// km==1
+	movsd		%xmm0,  0(%r11)
+	cmpl		$2, %r15d
+	jl			3f // end
+	movsd		%xmm1, 32(%r11)
+	cmpl		$3, %r15d
+	jl			3f // end
+	movsd		%xmm2, 64(%r11)
+	je			3f // end
+	movsd		%xmm3, 96(%r11)
+
+	jmp		3f
+
+4:
+	// km==2
+	movapd		%xmm0,  0(%r11)
+	cmpl		$2, %r15d
+	jl			3f // end
+	movapd		%xmm1, 32(%r11)
+	cmpl		$3, %r15d
+	jl			3f // end
+	movapd		%xmm2, 64(%r11)
+	je			3f // end
+	movapd		%xmm3, 96(%r11)
+
+	jmp		3f
+
+5:
+	cmpl		$3, %r14d
+	jg			6f
+
+	// km==3
+	movapd		%xmm0,   0(%r11)
+	movsd		%xmm4,  16(%r11)
+	cmpl		$2, %r15d
+	jl			3f // end
+	movapd		%xmm1,  32(%r11)
+	movsd		%xmm5,  48(%r11)
+	cmpl		$3, %r15d
+	jl			3f // end
+	movapd		%xmm2,  64(%r11)
+	movsd		%xmm6,  80(%r11)
+	je			3f // end
+	movapd		%xmm3,  96(%r11)
+	movsd		%xmm7, 112(%r11)
+
+	jmp		3f
+
+6:
+	// km==4
+	movapd		%xmm0,   0(%r11)
+	movapd		%xmm4,  16(%r11)
+	cmpl		$2, %r15d
+	jl			3f // end
+	movapd		%xmm1,  32(%r11)
+	movapd		%xmm5,  48(%r11)
+	cmpl		$3, %r15d
+	jl			3f // end
+	movapd		%xmm2,  64(%r11)
+	movapd		%xmm6,  80(%r11)
+	je			3f // end
+	movapd		%xmm3,  96(%r11)
+	movapd		%xmm7, 112(%r11)
+
+	jmp		3f
+
+0:
+	
+	movq	%r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+	cmpl	$1, %r10d
+	jg		1f
+
+	///////////////
+	// offset==1 //
+	///////////////
+
+	// TODO
+
+	jmp		3f
+
+1:
+
+	cmpl	$2, %r10d
+	jg		2f
+
+	///////////////
+	// offset==2 //
+	///////////////
+
+	// TODO
+
+	jmp		3f
+
+2:
+
+	///////////////
+	// offset==3 //
+	///////////////
+
+	// TODO
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_gen_lib4, .-inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular
+//
+// input arguments:
+// r10   <- D
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_4x4_lib4, @function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_lib4:
+#endif
+#endif
+	
+	movapd		%xmm0,   0(%r10)
+	movapd		%xmm4,  16(%r10)
+	movsd		32(%r10), %xmm15
+	movsd		%xmm15, %xmm1
+	movapd		%xmm1,  32(%r10)
+	movapd		%xmm5,  48(%r10)
+//	movapd		%xmm2,  64(%r10)
+	movapd		%xmm6,  80(%r10)
+//	movapd		%xmm3,  96(%r10)
+	movsd		112(%r10), %xmm15
+	movsd		%xmm15, %xmm7
+	movapd		%xmm7, 112(%r10)
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs lower triangular
+//
+// input arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10   <- D
+// r11d   <- km
+// r12d   <- kn
+// xmm0  <- [d00 d10]
+// xmm1  <- [d01 d11]
+// xmm2  <- [d02 d12]
+// xmm3  <- [d03 d13]
+// xmm4  <- [d20 d30]
+// xmm5  <- [d21 d31]
+// xmm6  <- [d22 d32]
+// xmm7  <- [d23 d33]
+// xmm8  <- dirty
+// xmm9  <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_L_4X4_VS_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_l_4x4_vs_lib4, @function
+inner_store_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_vs_lib4:
+#endif
+#endif
+	
+	cmpl		$2, %r11d
+	jg			1f
+	je			0f
+
+	// km==1
+	movsd		%xmm0,  0(%r10)
+
+	jmp		3f
+
+0:
+	// km==2
+	cmpl		$2, %r12d
+	movapd		%xmm0,  0(%r10)
+	jl			3f // end
+	movsd		32(%r10), %xmm15
+	movsd		%xmm15, %xmm1
+	movapd		%xmm1, 32(%r10)
+
+	jmp		3f
+
+1:
+	cmpl		$3, %r11d
+	jg			2f
+
+	// km==3
+	cmpl		$2, %r12d
+	movapd		%xmm0,   0(%r10)
+	movsd		%xmm4,  16(%r10)
+	jl			3f // end
+	cmpl		$3, %r12d
+	movsd		32(%r10), %xmm15
+	movsd		%xmm15, %xmm1
+	movapd		%xmm1,  32(%r10)
+	movsd		%xmm5,  48(%r10)
+	jl			3f // end
+//	movapd		%xmm2,  64(%r10)
+	movsd		%xmm6,  80(%r10)
+
+	jmp		3f
+
+2:
+	// km==4
+	cmpl		$2, %r12d
+	movapd		%xmm0,   0(%r10)
+	movapd		%xmm4,  16(%r10)
+	jl			3f // end
+	cmpl		$3, %r12d
+	movsd		32(%r10), %xmm15
+	movsd		%xmm15, %xmm1
+	movapd		%xmm1,  32(%r10)
+	movapd		%xmm5,  48(%r10)
+	jl			3f // end
+//	movapd		%xmm2,  64(%r10)
+	movapd		%xmm6,  80(%r10)
+	je			3f // end
+//	movapd		%xmm3,  96(%r10)
+	movsd		112(%r10), %xmm15
+	movsd		%xmm15, %xmm7
+	movapd		%xmm7, 112(%r10)
+
+3:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_4x4_vs_lib4, .-inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+//                               rdi    rsi            rdx        rcx        r8            r9         rsp+8
+// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
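+// For reference only (not assembled): a scalar C sketch of what this kernel
+// computes. A, B, C and D are 4x4 blocks stored panel-major (element (i,j) at
+// offset i+4*j doubles), A and B are read as k-column panels, and 'nt' means B
+// enters transposed, so D = beta*C + alpha*A*B^T. alpha/beta are shown by value
+// for brevity; the kernel takes them by pointer as in the prototype above.
+//
+//   static void ref_dgemm_nt_4x4_lib4(int k, double alpha, const double *A,
+//                                     const double *B, double beta,
+//                                     const double *C, double *D)
+//   {
+//       for(int j=0; j<4; j++)
+//           for(int i=0; i<4; i++) {
+//               double t = 0.0;
+//               for(int l=0; l<k; l++) t += A[i+4*l]*B[j+4*l];
+//               D[i+4*j] = beta*C[i+4*j] + alpha*t;
+//           }
+//   }
+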
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x4_lib4
+	.type kernel_dgemm_nt_4x4_lib4, @function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x4_lib4
+	.def kernel_dgemm_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx        r8            r9         rsp+8     rsp+16   rsp+24
+// void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
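+// For reference only: the _vs ("variable size") variant computes the same
+// D = beta*C + alpha*A*B^T as the kernel above, but the store is masked so that
+// only the top-left km x kn part of the 4x4 block is written (km, kn <= 4); the
+// remaining entries of D are left untouched, e.g.
+//
+//   for(int j=0; j<kn; j++)
+//       for(int i=0; i<km; i++)
+//           D[i+4*j] = beta*C[i+4*j] + alpha*t[i][j]; // t as in the sketch above
+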
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x4_vs_lib4
+	.type kernel_dgemm_nt_4x4_vs_lib4, @function
+kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x4_vs_lib4
+_kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x4_vs_lib4
+	.def kernel_dgemm_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_vs_lib4, .-kernel_dgemm_nt_4x4_vs_lib4
+#endif
+
+
+
+
+
+#if 0
+
+//                                   1      2              3          4          5             6            7          8        9            10         11       12      13      14      15
+// void kernel_dgemm_nt_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nt_4x4_gen_lib4
+	.type kernel_dgemm_nt_4x4_gen_lib4, @function
+kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x4_gen_lib4
+_kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x4_gen_lib4
+	.def kernel_dgemm_nt_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_gen_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+#if 0 //
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // offsetC
+	movq	ARG7, %r13 // C
+	movq	ARG8, %r14 // sdc
+	sall	$5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+#else //
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG7, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+#endif //
+
+	// store n gen
+
+	movq	ARG9, %r10 // offsetD
+	movq	ARG10, %r11 // D
+	movq	ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG12, %r13 // m0
+	movq	ARG13, %r14 // m1
+	movq	ARG14, %r15 // n0
+	movq	ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_gen_lib4, .-kernel_dgemm_nt_4x4_gen_lib4
+#endif
+
+#endif
+
+
+
+
+
+//                               1      2              3          4            5          6        7             8          9
+// void kernel_dgemm_nn_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_4x4_lib4
+	.type kernel_dgemm_nn_4x4_lib4, @function
+kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_4x4_lib4
+_kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_4x4_lib4
+	.def kernel_dgemm_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_4x4_lib4, .-kernel_dgemm_nn_4x4_lib4
+#endif
+
+
+
+
+
+//                                  1      2              3          4            5          6        7             8          9          10      11
+// void kernel_dgemm_nn_4x4_vs_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_nn_4x4_vs_lib4
+	.type kernel_dgemm_nn_4x4_vs_lib4, @function
+kernel_dgemm_nn_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nn_4x4_vs_lib4
+_kernel_dgemm_nn_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_nn_4x4_vs_lib4
+	.def kernel_dgemm_nn_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG5, %r12  // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG7, %r11 // beta
+	movq	ARG8, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG9, %r10 // D
+	movq	ARG10, %r11 // km 
+	movq	ARG11, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nn_4x4_vs_lib4, .-kernel_dgemm_nn_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                 rdi    rsi            rdx        rcx        r8            r9         rsp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
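+// For reference only: same computation as ref_dgemm_nt_4x4_lib4 sketched above
+// (D = beta*C + alpha*A*B^T), except that the lower-triangular store writes back
+// only the entries with i >= j; the values in the strictly upper part of the 4x4
+// block of D are left unchanged, e.g.
+//
+//   for(int j=0; j<4; j++)
+//       for(int i=j; i<4; i++)   // lower triangle only
+//           D[i+4*j] = beta*C[i+4*j] + alpha*t[i][j];
+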
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_4x4_lib4
+	.type kernel_dsyrk_nt_l_4x4_lib4, @function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_4x4_lib4
+	.def kernel_dsyrk_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call	inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq	_inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+//                                    rdi    rsi            rdx        rcx        r8            r9         rsp+8     rsp+16   rsp+24
+// void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_nt_l_4x4_vs_lib4
+	.type kernel_dsyrk_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_nt_l_4x4_vs_lib4
+_kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_nt_l_4x4_vs_lib4
+	.def kernel_dsyrk_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11  // A
+	movq	ARG4, %r12  // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend 
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call	inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq	_inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_nt_l_4x4_vs_lib4, .-kernel_dsyrk_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                  rdi    rsi            rdx        rcx        r8            r9         rsp+8
+// void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_4x4_lib4
+	.type kernel_dtrmm_nt_ru_4x4_lib4, @function
+kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_4x4_lib4
+_kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_4x4_lib4
+	.def kernel_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // B
+	addq	$128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG3, %r10
+	movq	ARG4, %r11
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_4x4_lib4, .-kernel_dtrmm_nt_ru_4x4_lib4
+#endif
+
+
+
+
+
+//                                     rdi    rsi            rdx        rcx        r8            r9         rsp+8     rsp+16   rsp+24
+// void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+	.type kernel_dtrmm_nt_ru_4x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nt_ru_4x4_vs_lib4
+_kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+	.def kernel_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt after initial triangle
+
+	movq	ARG1, %r10 // k
+	subl	$4, %r10d // k-4
+	movq	ARG3, %r11 // A
+	addq	$128, %r11 // A+4*bs
+	movq	ARG4, %r12 // B
+	addq	$128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender nn
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+	// call inner loader nn
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12   // C
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nt_ru_4x4_vs_lib4, .-kernel_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                  edi    rsi        rdx        ecx        r8         r9
+// void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
+
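+// For reference only (not assembled): a scalar C sketch of the fused operation,
+// reusing the illustrative helpers sketched earlier in this file. The kernel
+// first subtracts A*B^T from C, then Cholesky-factorizes the resulting 4x4 block,
+// storing the reciprocal diagonal in inv_diag_D. The sketch writes the full block
+// for simplicity; the actual kernel keeps it in registers and writes back only
+// the lower triangle of D.
+//
+//   static void ref_dpotrf_nt_l_4x4(int k, const double *A, const double *B,
+//                                   const double *C, double *D, double *inv_diag_D)
+//   {
+//       for(int j=0; j<4; j++)
+//           for(int i=0; i<4; i++) {
+//               double t = C[i+4*j];
+//               for(int l=0; l<k; l++) t -= A[i+4*l]*B[j+4*l];
+//               D[i+4*j] = t;
+//           }
+//       ref_potrf_4x4_vs(D, inv_diag_D, 4); // Cholesky sketch shown earlier
+//   }
+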
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_4x4_lib4
+	.type kernel_dpotrf_nt_l_4x4_lib4, @function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_4x4_lib4
+	.def kernel_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movl	$4, %r11d // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+//                                     edi    rsi        rdx        ecx        r8         r9                  rsp+8   rsp+16
+// void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dpotrf_nt_l_4x4_vs_lib4
+	.type kernel_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dpotrf_nt_l_4x4_vs_lib4
+	.def kernel_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG6, %r10  // inv_diag_D 
+	movq	ARG8, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG7, %r11 // km 
+	movq	ARG8, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                        edi     rsi         rdx         ecx     r8          r9          rsp+8      rsp+16     rsp+24
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
+
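+// For reference only: this fused kernel combines the two steps sketched earlier.
+// It accumulates the 4x4 block as C + Ap*Bp^T - Am*Bm^T (an add pass over kp
+// columns of Ap/Bp followed by a subtract pass over km columns of Am/Bm), i.e.
+//
+//   tmp(i,j) = C[i+4*j] + sum_l Ap[i+4*l]*Bp[j+4*l] - sum_l Am[i+4*l]*Bm[j+4*l]
+//
+// and then factorizes tmp exactly as in ref_dpotrf_nt_l_4x4 above: only the lower
+// triangle of D is stored and inv_diag_D receives the reciprocal diagonal.
+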
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movl	$4, %r11d
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+//                                           edi     rsi         rdx         ecx     r8          r9          rsp+8      rsp+16     rsp+24              rsp+32  rsp+40
+// void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+	.type kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+	.def kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// factorization
+
+	movq	ARG9, %r10  // inv_diag_D 
+	movq	ARG11, %r11 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10  // D 
+	movq	ARG10, %r11 // km 
+	movq	ARG11, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                      edi    rsi        rdx        ecx        r8         r9         rsp+8  
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+
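+// For reference only (not assembled): a scalar C sketch of the fused operation,
+// reusing the illustrative helpers sketched earlier in this file. The kernel
+// subtracts A*B^T from C and then applies the lower-triangular solve from the
+// right, D = (C - A*B^T) * E^{-T}, using the precomputed reciprocals in
+// inv_diag_E instead of divisions.
+//
+//   static void ref_dtrsm_nt_rl_inv_4x4(int k, const double *A, const double *B,
+//                                       const double *C, double *D,
+//                                       const double *E, const double *inv_diag_E)
+//   {
+//       for(int j=0; j<4; j++)
+//           for(int i=0; i<4; i++) {
+//               double t = C[i+4*j];
+//               for(int l=0; l<k; l++) t -= A[i+4*l]*B[j+4*l];
+//               D[i+4*j] = t;
+//           }
+//       ref_trsm_rlt_inv_4x4(D, E, inv_diag_E); // solve sketch shown earlier
+//   }
+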
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+	.type kernel_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+	.def kernel_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG4, %r10
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                            edi     rsi         rdx         ecx     r8          r9          rsp+8      rsp+16     rsp+24     rsp+32
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
+
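+// For reference only: the fused gemm+trsm kernel. The 4x4 block is accumulated as
+// C + Ap*Bp^T - Am*Bm^T (add pass over kp columns, subtract pass over km columns),
+// and the result is then solved against the lower triangular factor E from the
+// right exactly as in ref_dtrsm_nt_rl_inv_4x4 above, i.e.
+//
+//   D = (C + Ap*Bp^T - Am*Bm^T) * E^{-T}   (using inv_diag_E for the diagonal)
+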
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10   // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10   // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+//                                         edi    rsi        rdx        ecx        r8       r9           rsp+8               rsp+16  rsp+24
+// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.type kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.def kernel_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt 
+
+	movq	ARG1, %r10
+	movq	ARG2, %r11
+	movq	ARG3, %r12
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn // TODO scale gen
+
+	movq	ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG6, %r10  // E 
+	movq	ARG7, %r11  // inv_diag_E 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG5, %r10 // D
+	movq	ARG8, %r11 // km 
+	movq	ARG9, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                               edi     rsi         rdx         ecx     r8          r9          rsp+8      rsp+16     rsp+24     rsp+32              rsp+40  rsp+48
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.type kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+	.def kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+	// call inner dgemm kernel nt add
+
+	movq	ARG1, %r10 // kp
+	movq	ARG2, %r11  // Ap
+	movq	ARG3, %r12  // Bp
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG4, %r10 // km
+	movq	ARG5, %r11   // Am
+	movq	ARG6, %r12   // Bm
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blender_loader nn
+
+	movq	ARG7, %r10  // C 
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+	// solve
+
+	movq	ARG9, %r10  // E 
+	movq	ARG10, %r11  // inv_diag_E 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+	// store
+
+	movq	ARG8, %r10 // D 
+	movq	ARG11, %r11 // km 
+	movq	ARG12, %r12 // kn 
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+//                                  1      2              3          4            5          6        7
+// void kernel_dtrmm_nn_rl_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.globl kernel_dtrmm_nn_rl_4x4_lib4
+	.type kernel_dtrmm_nn_rl_4x4_lib4, @function
+kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dtrmm_nn_rl_4x4_lib4
+_kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.globl kernel_dtrmm_nn_rl_4x4_lib4
+	.def kernel_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_lib4:
+#endif
+	
+	PROLOGUE
+
+	// zero accumulation registers
+
+	xorpd	%xmm0, %xmm0
+	movapd	%xmm0, %xmm1
+	movapd	%xmm0, %xmm2
+	movapd	%xmm0, %xmm3
+	movapd	%xmm0, %xmm4
+	movapd	%xmm0, %xmm5
+	movapd	%xmm0, %xmm6
+	movapd	%xmm0, %xmm7
+
+
+
+	// initial triangle
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG5, %r12 // B
+	movq	ARG6, %r13 // sdb
+	sall	$5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_edge_dtrmm_nn_rl_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+	movq	ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+	EPILOGUE
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dtrmm_nn_rl_4x4_lib4, .-kernel_dtrmm_nn_rl_4x4_lib4
+#endif
+
+
+
+
+
+	// read-only data
+#if defined(OS_LINUX)
+	.section	.rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+	.section	__TEXT,__const
+#elif defined(OS_WINDOWS)
+	.section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+	.align 5
+#endif
+	.quad	-1
+	.quad	-1
+	.quad	-1
+	.quad	-1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1071644672
+	.long	0
+	.long	1073217536
+	.long	0
+	.long	1074003968
+	.long	0
+	.long	1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+	.align 5
+#endif
+	.long	0
+	.long	1074921472
+	.long	0
+	.long	1075183616
+	.long	0
+	.long	1075445760
+	.long	0
+	.long	1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+	.align 5
+#endif
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
+	.long	0
+	.long	1072693248
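+
+	// Encoding note (sketch, not part of the generated code): LC00 and LC01 are
+	// stored as raw 64-bit integers (.quad), where -1 yields the all-ones bit
+	// pattern typically used as a lane mask; LC02-LC04 emit doubles as .long
+	// pairs, low word first, e.g. (0, 1072693248) is 0x3FF0000000000000 = 1.0
+	// and (0, 1071644672) is 0x3FE0000000000000 = 0.5, so LC02 holds
+	// 0.5, 1.5, 2.5, 3.5 in memory order (element 0 first).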
+
+
+
+#if defined(OS_LINUX)
+	.section	.note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+	.subsections_via_symbols
+#endif
+
diff --git a/lib/dummy.txt b/lib/dummy.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/lib/dummy.txt
diff --git a/test_problems/CMakeLists.txt b/test_problems/CMakeLists.txt
new file mode 100644
index 0000000..77becb1
--- /dev/null
+++ b/test_problems/CMakeLists.txt
@@ -0,0 +1,32 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of HPIPM.                                                                     #
+#                                                                                                 #
+# HPIPM -- High Performance Interior Point Method.                                                #
+# Copyright (C) 2017 by Gianluca Frison.                                                          #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+add_executable(d_blas test_blas_d.c)
+target_link_libraries(d_blas blasfeo m)
+
+add_executable(s_blas test_blas_s.c)
+target_link_libraries(s_blas blasfeo m)
diff --git a/test_problems/Makefile b/test_problems/Makefile
new file mode 100644
index 0000000..f2e4741
--- /dev/null
+++ b/test_problems/Makefile
@@ -0,0 +1,67 @@
+###################################################################################################
+#                                                                                                 #
+# This file is part of BLASFEO.                                                                   #
+#                                                                                                 #
+# BLASFEO -- BLAS For Embedded Optimization.                                                      #
+# Copyright (C) 2016-2017 by Gianluca Frison.                                                     #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              #
+# All rights reserved.                                                                            #
+#                                                                                                 #
+# HPMPC is free software; you can redistribute it and/or                                          #
+# modify it under the terms of the GNU Lesser General Public                                      #
+# License as published by the Free Software Foundation; either                                    #
+# version 2.1 of the License, or (at your option) any later version.                              #
+#                                                                                                 #
+# HPMPC is distributed in the hope that it will be useful,                                        #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of                                  #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            #
+# See the GNU Lesser General Public License for more details.                                     #
+#                                                                                                 #
+# You should have received a copy of the GNU Lesser General Public                                #
+# License along with HPMPC; if not, write to the Free Software                                    #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  #
+#                                                                                                 #
+# Author: Gianluca Frison, giaf (at) dtu.dk                                                       #
+#                          gianluca.frison (at) imtek.uni-freiburg.de                             #
+#                                                                                                 #
+###################################################################################################
+
+include ../Makefile.rule
+
+ifeq ($(REF_BLAS), 0)
+LIBS = -lm 
+endif
+ifeq ($(REF_BLAS), OPENBLAS)
+LIBS = /opt/openblas/lib/libopenblas.a -pthread -lgfortran -lm
+endif
+ifeq ($(REF_BLAS), BLIS)
+LIBS = /opt/netlib/liblapack.a /opt/blis/lib/libblis.a -lgfortran -lm -fopenmp
+endif
+ifeq ($(REF_BLAS), NETLIB)
+LIBS = /opt/netlib/liblapack.a /opt/netlib/libblas.a -lgfortran -lm
+endif
+ifeq ($(REF_BLAS), MKL)
+LIBS = -Wl,--start-group /opt/intel/mkl/lib/intel64/libmkl_gf_lp64.a /opt/intel/mkl/lib/intel64/libmkl_core.a /opt/intel/mkl/lib/intel64/libmkl_sequential.a -Wl,--end-group -ldl -lpthread -lm
+endif
+ifeq ($(REF_BLAS), ATLAS)
+LIBS = /opt/atlas/lib/liblapack.a /opt/atlas/lib/libcblas.a /opt/atlas/lib/libf77blas.a /opt/atlas/lib/libatlas.a -lgfortran -lm
+endif
+
+#ifneq ($(NUM_THREAD), 1)
+#LIBS += -pthread 
+#endif
+
+OBJS_TEST = test_blas_d.o
+#OBJS_TEST = test_blas_s.o
+#OBJS_TEST = test_d_strmat.o
+#OBJS_TEST = test_s_strmat.o
+#OBJS_TEST = kernel_assembly.o test_assembly.o
+
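+# test.out links against a local copy of libblasfeo.a; the static library is
+# assumed to be copied into this directory by the top-level build before this
+# target is invoked (note that the clean target removes that local copy).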
+obj: $(OBJS_TEST)
+	$(CC) -o test.out $(OBJS_TEST) -L. libblasfeo.a $(LIBS) #-pg
+
+clean:
+	rm -f *.o
+	rm -f test.out
+	rm -f libblasfeo.a
+
diff --git a/test_problems/cpu_freq.h b/test_problems/cpu_freq.h
new file mode 100644
index 0000000..30320fc
--- /dev/null
+++ b/test_problems/cpu_freq.h
@@ -0,0 +1,31 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
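+// Default processor clock (in GHz) used by the test programs to compute the
+// theoretical peak; thanks to the #ifndef guard below, the value can be
+// overridden at compile time (e.g. with -DGHZ_MAX=...) to match the machine
+// under test.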
+#ifndef GHZ_MAX
+#define GHZ_MAX 3.6
+#endif
diff --git a/test_problems/kernel_assembly.S b/test_problems/kernel_assembly.S
new file mode 100644
index 0000000..b393e0d
--- /dev/null
+++ b/test_problems/kernel_assembly.S
@@ -0,0 +1,633 @@
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1  %rdi
+#define ARG2  %rsi
+#define ARG3  %rdx
+#define ARG4  %rcx
+#define ARG5  %r8
+#define ARG6  %r9
+#define ARG7  STACKSIZE +  8(%rsp)
+#define ARG8  STACKSIZE + 16(%rsp)
+#define ARG9  STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1  %rcx
+#define ARG2  %rdx
+#define ARG3  %r8
+#define ARG4  %r9
+#define ARG5  STACKSIZE + 40(%rsp)
+#define ARG6  STACKSIZE + 48(%rsp)
+#define ARG7  STACKSIZE + 56(%rsp)
+#define ARG8  STACKSIZE + 64(%rsp)
+#define ARG9  STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+
+#else
+
+#error wrong OS
+
+#endif
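+
+// Argument access: on Linux/Mac (System V AMD64 ABI) the first six arguments
+// arrive in rdi, rsi, rdx, rcx, r8, r9 and the remaining ones on the stack;
+// on Windows x64 only the first four are passed in rcx, rdx, r8, r9 and the
+// caller reserves a 32-byte shadow space, hence the larger base offset.
+// The prologue below subtracts STACKSIZE from rsp to save callee-saved
+// registers, so stack arguments are read at STACKSIZE + offset(%rsp).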
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.text
+#elif defined(OS_MAC)
+	.section	__TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d   <- k
+// r11   <- A
+// r12   <- B
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d  <- 0
+// r11   <- A+4*k*sizeof(double)
+// r12   <- B+4*k*sizeof(double)
+// ymm0  <- [d00 d11 d22 d33]
+// ymm1  <- [d01 d10 d23 d32]
+// ymm2  <- [d03 d12 d21 d30]
+// ymm3  <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
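+// Note: the 4x4 product is accumulated in the permuted layout documented above
+// (each ymm register holds one "diagonal" of the 4x4 block), obtained by
+// shuffling B in registers with vshufpd / vperm2f128 inside the loop; the
+// INNER_BLEND_* routine further below reorders the accumulators back into
+// plain columns before scaling and storing.
+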
+#if MACRO_LEVEL>=2
+	.macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+	
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// prefetch
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmovapd 0(%r12), %ymm12 // B[0]
+
+	cmpl	$4, %r10d
+	jle		0f // consider clean-up loop
+
+	// main loop
+	.p2align 3
+1: // main loop
+	
+	// unroll 0
+	vmovapd 32(%r12), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	// unroll 1
+	vmovapd 64(%r12), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	// unroll 2
+	vmovapd 96(%r12), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r12
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	addq	$128, %r11
+
+
+	// unroll 3
+	vmovapd 0(%r12), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 0(%r11), %ymm8 // A0[0]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	cmpl	$4, %r10d
+	jg		1b // main loop 
+
+
+0: // consider clean4-up
+	
+	cmpl	$3, %r10d
+	jle		4f // clean1
+
+	// unroll 0
+	vmovapd 32(%r12), %ymm13 // B[4]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 32(%r11), %ymm10 // A0[4]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	subl	$4, %r10d
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	// unroll 1
+	vmovapd 64(%r12), %ymm12 // B[8]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 64(%r11), %ymm8 // A0[8]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	// unroll 2
+	vmovapd 96(%r12), %ymm13 // B[12]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	vmovapd 96(%r11), %ymm10 // A0[12]
+
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	addq	$128, %r12
+
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+	addq	$128, %r11
+
+
+	// unroll 3
+//	vmovapd 0(%r12), %ymm12 // B[0]
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+//	vmovapd 0(%r11), %ymm8 // A0[0]
+
+	vmulpd	%ymm10, %ymm13, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+	vmulpd	%ymm10, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+
+//	cmpl	$3, %r10d
+	jmp		2f // return
+
+
+4: // consider clean1-up loop
+
+	cmpl	$0, %r10d
+	jle		2f // return
+
+	// clean-up loop
+3: // clean up loop
+	
+	vmovapd 0(%r12), %ymm12 // B[0]
+	vmovapd 0(%r11), %ymm8 // A0[0]
+	vmulpd	%ymm8, %ymm12, %ymm15
+	vaddpd	%ymm0, %ymm15, %ymm0
+	addq	$32, %r11
+
+	vshufpd $0x5, %ymm12, %ymm12, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm1, %ymm15, %ymm1
+	addq	$32, %r12
+
+	vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm3, %ymm15, %ymm3
+	subl	$1, %r10d
+
+	vshufpd $0x5, %ymm14, %ymm14, %ymm14
+	vmulpd	%ymm8, %ymm14, %ymm15
+	vaddpd	%ymm2, %ymm15, %ymm2
+
+	cmpl	$0, %r10d
+	jg		3b // clean up loop 
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10   <- alpha
+// r11   <- beta
+// r12   <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8  <- dirty
+// ymm9  <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+	.macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+	
+	// alpha
+	vbroadcastsd	0(%r10), %ymm15
+
+	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
+	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
+	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
+	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
+
+	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
+	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
+	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
+	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
+
+	vmulpd		%ymm0, %ymm15, %ymm0
+	vmulpd		%ymm1, %ymm15, %ymm1
+	vmulpd		%ymm2, %ymm15, %ymm2
+	vmulpd		%ymm3, %ymm15, %ymm3
+
+	// beta
+	vbroadcastsd	0(%r11), %ymm14
+
+	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
+	je			0f // end
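+	// when beta compares equal to 0.0 the loads of C below are skipped
+	// entirely, so C does not need to hold valid data in that case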
+
+	vmovupd		0(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm0, %ymm15, %ymm0
+	vmovupd		32(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm1, %ymm15, %ymm1
+	vmovupd		64(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm2, %ymm15, %ymm2
+	vmovupd		96(%r12), %ymm15
+	vmulpd		%ymm15, %ymm14, %ymm15
+	vaddpd		%ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10  <- D
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+//
+// output arguments:
+// r10  <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+	.macro INNER_STORE_4X4_LIB4
+#else
+	.p2align 4,,15
+#if defined(OS_LINUX)
+	.type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+	.def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+	
+	vmovupd %ymm0,  0(%r10)
+	vmovupd %ymm1, 32(%r10)
+	vmovupd %ymm2, 64(%r10)
+	vmovupd %ymm3, 96(%r10)
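+	// the 4x4 block is stored as four consecutive columns of 4 doubles each
+	// (32 bytes apart), i.e. the panel-major layout used by the lib4 kernels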
+	
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+//                               1      2              3          4          5             6          7
+// void kernel_dgemm_nt_4x4_lib4_test(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+	.p2align 4,,15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	.globl kernel_dgemm_nt_4x4_lib4_test
+#if defined(OS_LINUX)
+	.type kernel_dgemm_nt_4x4_lib4_test, @function
+#else // OS_WINDOWS
+	.def kernel_dgemm_nt_4x4_lib4_test; .scl 2; .type 32; .endef
+#endif
+kernel_dgemm_nt_4x4_lib4_test:
+#elif defined(OS_MAC)
+	.globl _kernel_dgemm_nt_4x4_lib4_test
+_kernel_dgemm_nt_4x4_lib4_test:
+#endif
+
+	// prologue
+
+	subq	$STACKSIZE, %rsp
+	movq	%rbx,   (%rsp)
+	movq	%rbp,  8(%rsp)
+	movq	%r12, 16(%rsp)
+	movq	%r13, 24(%rsp)
+	movq	%r14, 32(%rsp)
+	movq	%r15, 40(%rsp)
+#if defined(OS_WINDOWS)
+	movq	%rdi, 48(%rsp)
+	movq	%rsi, 56(%rsp)
+	vmovups	%xmm6, 64(%rsp)
+	vmovups	%xmm7, 80(%rsp)
+	vmovups	%xmm8, 96(%rsp)
+	vmovups	%xmm9, 112(%rsp)
+	vmovups	%xmm10, 128(%rsp)
+	vmovups	%xmm11, 144(%rsp)
+	vmovups	%xmm12, 160(%rsp)
+	vmovups	%xmm13, 176(%rsp)
+	vmovups	%xmm14, 192(%rsp)
+	vmovups	%xmm15, 208(%rsp)
+#endif
+
+	vzeroupper
+
+
+	// zero accumulation registers
+
+	vxorpd	%ymm0, %ymm0, %ymm0
+	vmovapd	%ymm0, %ymm1
+	vmovapd	%ymm0, %ymm2
+	vmovapd	%ymm0, %ymm3
+
+
+	// call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+	// call inner blend scale
+
+	movq	ARG2, %r10 // alpha
+	movq	ARG5, %r11 // beta
+	movq	ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+	movq	ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+	call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+	callq _inner_store_4x4_lib4
+#endif
+#endif
+
+//	movq	ARG6, %rax
+//	movq	STACKSIZE + 48(%rsp), %rax
+
+
+	// epilogue
+
+	vzeroupper
+
+	movq	  (%rsp), %rbx 
+	movq	 8(%rsp), %rbp
+	movq	16(%rsp), %r12 
+	movq	24(%rsp), %r13 
+	movq	32(%rsp), %r14 
+	movq	40(%rsp), %r15 
+#if defined(OS_WINDOWS)
+	movq	48(%rsp), %rdi
+	movq	56(%rsp), %rsi
+	vmovups	64(%rsp), %xmm6
+	vmovups	80(%rsp), %xmm7
+	vmovups	96(%rsp), %xmm8
+	vmovups	112(%rsp), %xmm9
+	vmovups	128(%rsp), %xmm10
+	vmovups	144(%rsp), %xmm11
+	vmovups	160(%rsp), %xmm12
+	vmovups	176(%rsp), %xmm13
+	vmovups	192(%rsp), %xmm14
+	vmovups	208(%rsp), %xmm15
+#endif
+	addq	$STACKSIZE, %rsp
+
+
+	ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dgemm_nt_4x4_lib4_test, .-kernel_dgemm_nt_4x4_lib4_test
+#endif
+
+
diff --git a/test_problems/results/dummy.txt b/test_problems/results/dummy.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test_problems/results/dummy.txt
diff --git a/test_problems/test_assembly.c b/test_problems/test_assembly.c
new file mode 100644
index 0000000..3a07a13
--- /dev/null
+++ b/test_problems/test_assembly.c
@@ -0,0 +1,59 @@
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+int main()
+	{
+
+	printf("\ntest assembly\n");
+
+	int ii;
+
+	int n = 12;
+
+	double *A; d_zeros(&A, n, n);
+	for(ii=0; ii<n*n; ii++) A[ii] = ii;
+	d_print_mat(n, n, A, n);
+
+	double *B; d_zeros(&B, n, n);
+	for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+	d_print_mat(n, n, B, n);
+
+	struct d_strmat sA;
+	d_allocate_strmat(n, n, &sA);
+	d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+	d_print_strmat(n, n, &sA, 0, 0);
+
+	struct d_strmat sB;
+	d_allocate_strmat(n, n, &sB);
+	d_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+	d_print_strmat(n, n, &sB, 0, 0);
+
+	struct d_strmat sD;
+	d_allocate_strmat(n, n, &sD);
+
+	struct d_strmat sC;
+	d_allocate_strmat(n, n, &sC);
+
+	double alpha = 1.0;
+	double beta = 0.0;
+	int ret = kernel_dgemm_nt_4x4_lib4_test(n, &alpha, sB.pA, sA.pA, &beta, sB.pA, sD.pA);
+	d_print_strmat(n, n, &sD, 0, 0);
+//	printf("\n%ld %ld\n", (long long) n, ret);
+//	printf("\n%ld %ld\n", (long long) &alpha, ret);
+//	printf("\n%ld %ld\n", (long long) sA.pA, ret);
+//	printf("\n%ld %ld\n", (long long) sB.pA, ret);
+//	printf("\n%ld %ld\n", (long long) &beta, ret);
+//	printf("\n%ld %ld\n", (long long) sC.pA, ret);
+//	printf("\n%ld %ld\n", (long long) sD.pA, ret);
+
+	return 0;
+
+	}
diff --git a/test_problems/test_blas_d.c b/test_problems/test_blas_d.c
new file mode 100644
index 0000000..1e71494
--- /dev/null
+++ b/test_problems/test_blas_d.c
@@ -0,0 +1,480 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+//#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+//#include <xmmintrin.h> // needed to flush to zero sub-normals with _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON); in the main()
+//#endif
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+#ifndef D_PS
+#define D_PS 1
+#endif
+#ifndef D_NC
+#define D_NC 1
+#endif
+
+
+
+#if defined(REF_BLAS_OPENBLAS)
+void openblas_set_num_threads(int num_threads);
+#endif
+#if defined(REF_BLAS_BLIS)
+void omp_set_num_threads(int num_threads);
+#endif
+#if defined(REF_BLAS_MKL)
+#include "mkl.h"
+#endif
+
+
+
+#include "cpu_freq.h"
+
+
+
+int main()
+	{
+		
+#if defined(REF_BLAS_OPENBLAS)
+	openblas_set_num_threads(1);
+#endif
+#if defined(REF_BLAS_BLIS)
+	omp_set_num_threads(1);
+#endif
+#if defined(REF_BLAS_MKL)
+	mkl_set_num_threads(1);
+#endif
+
+//#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+//	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // flush to zero subnormals !!! works only with one thread !!!
+//#endif
+
+	printf("\n");
+	printf("\n");
+	printf("\n");
+
+	printf("BLAS performance test - double precision\n");
+	printf("\n");
+
+	// maximum frequency of the processor
+	const float GHz_max = GHZ_MAX;
+	printf("Frequency used to compute theoretical peak: %5.1f GHz (edit cpu_freq.h to modify this value).\n", GHz_max);
+	printf("\n");
+
+	// maximum flops per cycle, double precision
+#if defined(TARGET_X64_INTEL_HASWELL)
+	const float flops_max = 16;
+	printf("Testing BLAS version for AVX2 and FMA instruction sets, 64 bit (optimized for Intel Haswell): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	const float flops_max = 8;
+	printf("Testing BLAS version for AVX instruction set, 64 bit (optimized for Intel Sandy Bridge): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_INTEL_CORE)
+	const float flops_max = 4;
+	printf("Testing BLAS version for SSE3 instruction set, 64 bit (optimized for Intel Core): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_AMD_BULLDOZER)
+	const float flops_max = 8;
+	printf("Testing BLAS version for SSE3 and FMA instruction set, 64 bit (optimized for AMD Bulldozer): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+	const float flops_max = 4;
+	printf("Testing BLAS version for NEONv2 instruction set, 64 bit (optimized for ARM Cortex A57): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+	const float flops_max = 2;
+	printf("Testing BLAS version for VFPv4 instruction set, 32 bit (optimized for ARM Cortex A15): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_GENERIC)
+	const float flops_max = 2;
+	printf("Testing BLAS version for generic scalar instruction set: theoretical peak %5.1f Gflops ???\n", flops_max*GHz_max);
+#endif
+	
+//	FILE *f;
+//	f = fopen("./test_problems/results/test_blas.m", "w"); // a
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+//	fprintf(f, "C = 'd_x64_intel_haswell';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+//	fprintf(f, "C = 'd_x64_intel_sandybridge';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_X64_INTEL_CORE)
+//	fprintf(f, "C = 'd_x64_intel_core';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_X64_AMD_BULLDOZER)
+//	fprintf(f, "C = 'd_x64_amd_bulldozer';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+//	fprintf(f, "C = 'd_armv8a_arm_cortex_a57';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+//	fprintf(f, "C = 'd_armv7a_arm_cortex_a15';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_GENERIC)
+//	fprintf(f, "C = 'd_generic';\n");
+//	fprintf(f, "\n");
+#endif
+
+//	fprintf(f, "A = [%f %f];\n", GHz_max, flops_max);
+//	fprintf(f, "\n");
+
+//	fprintf(f, "B = [\n");
+	
+
+
+	int i, j, rep, ll;
+	
+	const int bsd = D_PS;
+	const int ncd = D_NC;
+
+/*	int info = 0;*/
+	
+	printf("\nn\t  dgemm_blasfeo\t  dgemm_blas\n");
+	printf("\nn\t Gflops\t    %%\t Gflops\n\n");
+	
+#if 1
+	int nn[] = {4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300, 304, 308, 312, 316, 320, 324, 328, 332, 336, 340, 344, 348, 352, 356, 360, 364, 368, 372, 376, 380, 384, 388, 392, 396, 400, 404, 408, 412, 416, 420, 424, 428, 432, 436, 440, 444, 448, 452, 456, 460, 500, 550, 600, 650, 700};
+	int nnrep[] = {10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 400, 400, 400, 400, 400, 200, 200, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 20, 20, 20, 20, 20, 20, 20, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 4, 4, 4, 4, 4};
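+	// nn[] lists the matrix sizes to benchmark and nnrep[] the matching number
+	// of repetitions per size: larger problems are repeated fewer times so that
+	// each timing loop stays reasonably short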
+	
+//	for(ll=0; ll<24; ll++)
+	for(ll=0; ll<75; ll++)
+//	for(ll=0; ll<115; ll++)
+//	for(ll=0; ll<120; ll++)
+
+		{
+
+		int n = nn[ll];
+		int nrep = nnrep[ll];
+//		int n = ll+1;
+//		int nrep = nnrep[0];
+//		n = n<12 ? 12 : n;
+//		n = n<8 ? 8 : n;
+
+#else
+	int nn[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
+	
+	for(ll=0; ll<24; ll++)
+
+		{
+
+		int n = nn[ll];
+		int nrep = 40000; //nnrep[ll];
+#endif
+
+		double *A; d_zeros(&A, n, n);
+		double *B; d_zeros(&B, n, n);
+		double *C; d_zeros(&C, n, n);
+		double *M; d_zeros(&M, n, n);
+
+		char c_n = 'n';
+		char c_l = 'l';
+		char c_r = 'r';
+		char c_t = 't';
+		char c_u = 'u';
+		int i_1 = 1;
+		int i_t;
+		double d_1 = 1;
+		double d_0 = 0;
+	
+		for(i=0; i<n*n; i++)
+			A[i] = i;
+	
+		for(i=0; i<n; i++)
+			B[i*(n+1)] = 1;
+	
+		for(i=0; i<n*n; i++)
+			M[i] = 1;
+	
+		int n2 = n*n;
+		double *B2; d_zeros(&B2, n, n);
+		for(i=0; i<n*n; i++)
+			B2[i] = 1e-15;
+		for(i=0; i<n; i++)
+			B2[i*(n+1)] = 1;
+
+		int pnd = ((n+bsd-1)/bsd)*bsd;	
+		int cnd = ((n+ncd-1)/ncd)*ncd;	
+		int cnd2 = 2*((n+ncd-1)/ncd)*ncd;	
+
+		double *x; d_zeros_align(&x, pnd, 1);
+		double *y; d_zeros_align(&y, pnd, 1);
+		double *x2; d_zeros_align(&x2, pnd, 1);
+		double *y2; d_zeros_align(&y2, pnd, 1);
+		double *diag; d_zeros_align(&diag, pnd, 1);
+		int *ipiv; int_zeros(&ipiv, n, 1);
+
+		for(i=0; i<pnd; i++) x[i] = 1;
+		for(i=0; i<pnd; i++) x2[i] = 1;
+
+		// matrix struct
+#if 0
+		struct d_strmat sA; d_allocate_strmat(n+4, n+4, &sA);
+		struct d_strmat sB; d_allocate_strmat(n+4, n+4, &sB);
+		struct d_strmat sC; d_allocate_strmat(n+4, n+4, &sC);
+		struct d_strmat sD; d_allocate_strmat(n+4, n+4, &sD);
+		struct d_strmat sE; d_allocate_strmat(n+4, n+4, &sE);
+#else
+		struct d_strmat sA; d_allocate_strmat(n, n, &sA);
+		struct d_strmat sB; d_allocate_strmat(n, n, &sB);
+		struct d_strmat sB2; d_allocate_strmat(n, n, &sB2);
+		struct d_strmat sB3; d_allocate_strmat(n, n, &sB3);
+		struct d_strmat sC; d_allocate_strmat(n, n, &sC);
+		struct d_strmat sD; d_allocate_strmat(n, n, &sD);
+		struct d_strmat sE; d_allocate_strmat(n, n, &sE);
+#endif
+		struct d_strvec sx; d_allocate_strvec(n, &sx);
+		struct d_strvec sy; d_allocate_strvec(n, &sy);
+		struct d_strvec sz; d_allocate_strvec(n, &sz);
+
+		d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+		d_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+		d_cvt_mat2strmat(n, n, B2, n, &sB2, 0, 0);
+		d_cvt_vec2strvec(n, x, &sx, 0);
+		int ii;
+		for(ii=0; ii<n; ii++)
+			{
+			DMATEL_LIBSTR(&sB3, ii, ii) = 1.0;
+//			DMATEL_LIBSTR(&sB3, n-1, ii) = 1.0;
+			DMATEL_LIBSTR(&sB3, ii, n-1) = 1.0;
+			DVECEL_LIBSTR(&sx, ii) = 1.0;
+			}
+//		d_print_strmat(n, n, &sB3, 0, 0);
+//		if(n==20) return;
+
+		int qr_work_size = 0;//dgeqrf_work_size_libstr(n, n);
+		void *qr_work;
+		v_zeros_align(&qr_work, qr_work_size);
+
+		int lq_work_size = 0;//dgelqf_work_size_libstr(n, n);
+		void *lq_work;
+		v_zeros_align(&lq_work, lq_work_size);
+
+		// create matrix to pivot all the time
+//		dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+
+		double *dummy;
+
+		int info;
+
+		/* timing */
+		struct timeval tvm1, tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, tv9, tv10, tv11, tv12, tv13, tv14, tv15, tv16;
+
+		/* warm up */
+		for(rep=0; rep<nrep; rep++)
+			{
+			dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sC, 0, 0);
+			}
+
+		double alpha = 1.0;
+		double beta = 0.0;
+
+		gettimeofday(&tv0, NULL); // stop
+
+		for(rep=0; rep<nrep; rep++)
+			{
+
+//			dgemm_nt_lib(n, n, n, 1.0, pA, cnd, pB, cnd, 0.0, pC, cnd, pC, cnd);
+//			dgemm_nn_lib(n, n, n, 1.0, pA, cnd, pB, cnd, 0.0, pC, cnd, pC, cnd);
+//			dsyrk_nt_l_lib(n, n, n, 1.0, pA, cnd, pB, cnd, 1.0, pC, cnd, pD, cnd);
+//			dtrmm_nt_ru_lib(n, n, pA, cnd, pB, cnd, 0, pC, cnd, pD, cnd);
+//			dpotrf_nt_l_lib(n, n, pB, cnd, pD, cnd, diag);
+//			dsyrk_dpotrf_nt_l_lib(n, n, n, pA, cnd, pA, cnd, 1, pB, cnd, pD, cnd, diag);
+//			dsyrk_nt_l_lib(n, n, n, pA, cnd, pA, cnd, 1, pB, cnd, pD, cnd);
+//			dpotrf_nt_l_lib(n, n, pD, cnd, pD, cnd, diag);
+//			dgetrf_nn_nopivot_lib(n, n, pB, cnd, pB, cnd, diag);
+//			dgetrf_nn_lib(n, n, pB, cnd, pB, cnd, diag, ipiv);
+//			dtrsm_nn_ll_one_lib(n, n, pD, cnd, pB, cnd, pB, cnd);
+//			dtrsm_nn_lu_inv_lib(n, n, pD, cnd, diag, pB, cnd, pB, cnd);
+			}
+	
+		gettimeofday(&tv1, NULL); // stop
+
+		for(rep=0; rep<nrep; rep++)
+			{
+//			kernel_dgemm_nt_12x4_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_dgemm_nt_8x8_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_dsyrk_nt_l_8x8_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_dgemm_nt_8x4_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_dgemm_nt_4x8_lib4(n, &alpha, sA.pA, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+//			kernel_dgemm_nt_4x4_lib4(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+//			kernel_dger4_12_sub_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+//			kernel_dger4_sub_12r_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+//			kernel_dger4_sub_8r_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+//			kernel_dger12_add_4r_lib4(n, sA.pA, sB.pA, sB.cn, sD.pA);
+//			kernel_dger8_add_4r_lib4(n, sA.pA, sB.pA, sB.cn, sD.pA);
+//			kernel_dger4_sub_4r_lib4(n, sA.pA, sB.pA, sD.pA);
+//			kernel_dger2_sub_4r_lib4(n, sA.pA, sB.pA, sD.pA);
+//			kernel_dger4_sub_8c_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+//			kernel_dger4_sub_4c_lib4(n, sA.pA, sA.cn, sB.pA, sD.pA, sD.cn);
+//			kernel_dgemm_nn_4x12_lib4(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+//			kernel_dgemm_nn_4x8_lib4(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+//			kernel_dgemm_nn_4x4_lib4(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+
+			dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+//			dgemm_nn_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+//			dsyrk_ln_libstr(n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+//			dsyrk_ln_mn_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+//			dpotrf_l_mn_libstr(n, n, &sB, 0, 0, &sB, 0, 0);
+//			dpotrf_l_libstr(n, &sB, 0, 0, &sB, 0, 0);
+//			dgetrf_nopivot_libstr(n, n, &sB, 0, 0, &sB, 0, 0);
+//			dgetrf_libstr(n, n, &sB, 0, 0, &sB, 0, 0, ipiv);
+//			dgeqrf_libstr(n, n, &sC, 0, 0, &sD, 0, 0, qr_work);
+//			dcolin_libstr(n, &sx, 0, &sB3, 0, n-1);
+//			dgelqf_libstr(n, n, &sB3, 0, 0, &sB3, 0, 0, lq_work);
+//			dtrmm_rlnn_libstr(n, n, 1.0, &sA, 0, 0, &sD, 0, 0, &sD, 0, 0); //
+//			dtrmm_rutn_libstr(n, n, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+//			dtrsm_llnu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+//			dtrsm_lunn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+//			dtrsm_rltn_libstr(n, n, 1.0, &sB2, 0, 0, &sD, 0, 0, &sD, 0, 0); //
+//			dtrsm_rltu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+//			dtrsm_rutn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+//			dgemv_n_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+//			dgemv_t_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+//			dsymv_l_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+//			dgemv_nt_libstr(n, n, 1.0, 1.0, &sA, 0, 0, &sx, 0, &sx, 0, 0.0, 0.0, &sy, 0, &sy, 0, &sz, 0, &sz, 0);
+			}
+
+//		d_print_strmat(n, n, &sD, 0, 0);
+
+		gettimeofday(&tv2, NULL); // stop
+
+		for(rep=0; rep<nrep; rep++)
+			{
+#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB) || defined(REF_BLAS_MKL)
+//			dgemm_(&c_n, &c_t, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
+//			dpotrf_(&c_l, &n, B2, &n, &info);
+//			dgemm_(&c_n, &c_n, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
+//			dsyrk_(&c_l, &c_n, &n, &n, &d_1, A, &n, &d_0, C, &n);
+//			dtrmm_(&c_r, &c_u, &c_t, &c_n, &n, &n, &d_1, A, &n, C, &n);
+//			dgetrf_(&n, &n, B2, &n, ipiv, &info);
+//			dtrsm_(&c_l, &c_l, &c_n, &c_u, &n, &n, &d_1, B2, &n, B, &n);
+//			dtrsm_(&c_l, &c_u, &c_n, &c_n, &n, &n, &d_1, B2, &n, B, &n);
+//			dtrtri_(&c_l, &c_n, &n, B2, &n, &info);
+//			dlauum_(&c_l, &n, B, &n, &info);
+//			dgemv_(&c_n, &n, &n, &d_1, A, &n, x, &i_1, &d_0, y, &i_1);
+//			dgemv_(&c_t, &n, &n, &d_1, A, &n, x2, &i_1, &d_0, y2, &i_1);
+//			dtrmv_(&c_l, &c_n, &c_n, &n, B, &n, x, &i_1);
+//			dtrsv_(&c_l, &c_n, &c_n, &n, B, &n, x, &i_1);
+//			dsymv_(&c_l, &n, &d_1, A, &n, x, &i_1, &d_0, y, &i_1);
+
+//			for(i=0; i<n; i++)
+//				{
+//				i_t = n-i;
+//				dcopy_(&i_t, &B[i*(n+1)], &i_1, &C[i*(n+1)], &i_1);
+//				}
+//			dsyrk_(&c_l, &c_n, &n, &n, &d_1, A, &n, &d_1, C, &n);
+//			dpotrf_(&c_l, &n, C, &n, &info);
+
+#endif
+
+#if defined(REF_BLAS_BLIS)
+//			dgemm_(&c_n, &c_t, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
+//			dgemm_(&c_n, &c_n, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
+//			dsyrk_(&c_l, &c_n, &n77, &n77, &d_1, A, &n77, &d_0, C, &n77);
+//			dtrmm_(&c_r, &c_u, &c_t, &c_n, &n77, &n77, &d_1, A, &n77, C, &n77);
+//			dpotrf_(&c_l, &n77, B, &n77, &info);
+//			dtrtri_(&c_l, &c_n, &n77, B, &n77, &info);
+//			dlauum_(&c_l, &n77, B, &n77, &info);
+#endif
+			}
+
+		gettimeofday(&tv3, NULL); // stop
+
+		float Gflops_max = flops_max * GHz_max;
+
+//		float flop_operation = 4*16.0*2*n; // kernel 12x4
+//		float flop_operation = 3*16.0*2*n; // kernel 12x4
+//		float flop_operation = 2*16.0*2*n; // kernel 8x4
+//		float flop_operation = 1*16.0*2*n; // kernel 4x4
+//		float flop_operation = 0.5*16.0*2*n; // kernel 2x4
+
+		float flop_operation = 2.0*n*n*n; // dgemm
+//		float flop_operation = 1.0*n*n*n; // dsyrk dtrmm dtrsm
+//		float flop_operation = 1.0/3.0*n*n*n; // dpotrf dtrtri
+//		float flop_operation = 2.0/3.0*n*n*n; // dgetrf
+//		float flop_operation = 4.0/3.0*n*n*n; // dgeqrf
+//		float flop_operation = 2.0*n*n; // dgemv dsymv
+//		float flop_operation = 1.0*n*n; // dtrmv dtrsv
+//		float flop_operation = 4.0*n*n; // dgemv_nt
+
+//		float flop_operation = 4.0/3.0*n*n*n; // dsyrk+dpotrf
+
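+		// average wall-clock time per call over nrep repetitions; Gflops is
+		// then 1e-9 * flop_operation / time, and the table printed below also
+		// reports the percentage of the theoretical peak Gflops_max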
+		float time_hpmpc    = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+		float time_blasfeo  = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+		float time_blas     = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+		float Gflops_hpmpc    = 1e-9*flop_operation/time_hpmpc;
+		float Gflops_blasfeo  = 1e-9*flop_operation/time_blasfeo;
+		float Gflops_blas     = 1e-9*flop_operation/time_blas;
+
+
+//		printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_hpmpc, 100.0*Gflops_hpmpc/Gflops_max, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+//		fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_hpmpc, 100.0*Gflops_hpmpc/Gflops_max, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+		printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+//		fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+
+
+		d_free(A);
+		d_free(B);
+		d_free(B2);
+		d_free(C); // also free C and diag (below), which were otherwise leaked across the size loop
+		d_free(M);
+		d_free_align(x);
+		d_free_align(y);
+		d_free_align(x2);
+		d_free_align(y2);
+		d_free_align(diag);
+		int_free(ipiv);
+		free(qr_work);
+		free(lq_work);
+		
+		d_free_strmat(&sA);
+		d_free_strmat(&sB);
+		d_free_strmat(&sB2);
+		d_free_strmat(&sB3);
+		d_free_strmat(&sC);
+		d_free_strmat(&sD);
+		d_free_strmat(&sE);
+		d_free_strvec(&sx);
+		d_free_strvec(&sy);
+		d_free_strvec(&sz);
+
+		}
+
+	printf("\n");
+
+//	fprintf(f, "];\n");
+//	fclose(f);
+
+	return 0;
+	
+	}
diff --git a/test_problems/test_blas_s.c b/test_problems/test_blas_s.c
new file mode 100644
index 0000000..3ea9f11
--- /dev/null
+++ b/test_problems/test_blas_s.c
@@ -0,0 +1,454 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_s_aux_ext_dep.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_blas.h"
+
+#ifndef S_PS
+#define S_PS 1
+#endif
+#ifndef S_NC
+#define S_NC 1
+#endif
+
+
+
+#if defined(REF_BLAS_OPENBLAS)
+void openblas_set_num_threads(int num_threads);
+#endif
+#if defined(REF_BLAS_BLIS)
+void omp_set_num_threads(int num_threads);
+#endif
+#if defined(REF_BLAS_MKL)
+#include "mkl.h"
+#endif
+
+
+
+#include "cpu_freq.h"
+
+
+
+int main()
+	{
+		
+#if defined(REF_BLAS_OPENBLAS)
+	openblas_set_num_threads(1);
+#endif
+#if defined(REF_BLAS_BLIS)
+	omp_set_num_threads(1);
+#endif
+#if defined(REF_BLAS_MKL)
+	mkl_set_num_threads(1);
+#endif
+
+	printf("\n");
+	printf("\n");
+	printf("\n");
+
+	printf("BLAS performance test - float precision\n");
+	printf("\n");
+
+	// maximum frequency of the processor
+	const float GHz_max = GHZ_MAX;
+	printf("Frequency used to compute theoretical peak: %5.1f GHz (edit cpu_freq.h to modify this value).\n", GHz_max);
+	printf("\n");
+
+	// maximum flops per cycle, single precision
+	// maximum memops (sustained load->store of floats) per cycle, single precision
+#if defined(TARGET_X64_INTEL_HASWELL)
+	const float flops_max = 32; // 2x256 bit fma
+	const float memops_max = 8; // 2x256 bit load + 1x256 bit store
+	printf("Testing BLAS version for AVX2 and FMA instruction sets, 64 bit (optimized for Intel Haswell): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+	const float flops_max = 16; // 1x256 bit mul + 1x256 bit add
+	const float memops_max = 4; // 1x256 bit load + 1x128 bit store
+	printf("Testing BLAS version for AVX instruction set, 64 bit (optimized for Intel Sandy Bridge): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_INTEL_CORE)
+	const float flops_max = 8; // 1x128 bit mul + 1x128 bit add
+	const float memops_max = 4; // 1x128 bit load + 1x128 bit store;
+	printf("Testing BLAS version for SSE3 instruction set, 64 bit (optimized for Intel Core): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_X64_AMD_BULLDOZER)
+	const float flops_max = 16; // 2x128 bit fma
+	const float memops_max = 4; // 1x256 bit load + 1x128 bit store
+	printf("Testing BLAS version for SSE3 and FMA instruction set, 64 bit (optimized for AMD Bulldozer): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+	const float flops_max = 8; // 1x128 bit fma
+	const float memops_max = 4; // ???
+	printf("Testing BLAS version for NEONv2 and VFPv4 instruction sets, 64 bit (optimized for ARM Cortex A57): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+	const float flops_max = 8; // 1x128 bit fma
+	const float memops_max = 4; // ???
+	printf("Testing BLAS version for VFPv4 instruction set, 32 bit (optimized for ARM Cortex A15): theoretical peak %5.1f Gflops\n", flops_max*GHz_max);
+#elif defined(TARGET_GENERIC)
+	const float flops_max = 2; // 1x32 bit mul + 1x32 bit add ???
+	const float memops_max = 1; // ???
+	printf("Testing BLAS version for generic scalar instruction set: theoretical peak %5.1f Gflops ???\n", flops_max*GHz_max);
+#endif
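+	// the theoretical peak used below is Gflops_max = flops_max * GHz_max;
+	// measured throughput is reported both in Gflops and as a percentage of this peak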
+	
+//	FILE *f;
+//	f = fopen("./test_problems/results/test_blas.m", "w"); // a
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+//	fprintf(f, "C = 's_x64_intel_haswell';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
+//	fprintf(f, "C = 's_x64_intel_sandybridge';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_X64_INTEL_CORE)
+//	fprintf(f, "C = 's_x64_intel_core';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_X64_AMD_BULLDOZER)
+//	fprintf(f, "C = 's_x64_amd_bulldozer';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+//	fprintf(f, "C = 's_armv8a_arm_cortex_a57';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+//	fprintf(f, "C = 's_armv7a_arm_cortex_a15';\n");
+//	fprintf(f, "\n");
+#elif defined(TARGET_GENERIC)
+//	fprintf(f, "C = 's_generic';\n");
+//	fprintf(f, "\n");
+#endif
+
+//	fprintf(f, "A = [%f %f];\n", GHz_max, flops_max);
+//	fprintf(f, "\n");
+
+//	fprintf(f, "B = [\n");
+	
+
+
+	int i, j, rep, ll;
+	
+	const int bss = S_PS;
+	const int ncs = S_NC;
+
+/*	int info = 0;*/
+	
+	printf("\nn\t  sgemm_blasfeo\t  sgemm_blas\n");
+	printf("\nn\t Gflops\t    %%\t Gflops\t    %%\n\n");
+	
+#if 1
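+	// matrix sizes to benchmark and, per size, the number of repetitions
+	// (larger sizes are repeated fewer times to bound the total runtime)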
+	int nn[] = {4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300, 304, 308, 312, 316, 320, 324, 328, 332, 336, 340, 344, 348, 352, 356, 360, 364, 368, 372, 376, 380, 384, 388, 392, 396, 400, 404, 408, 412, 416, 420, 424, 428, 432, 436, 440, 444, 448, 452, 456, 460, 500, 550, 600, 650, 700};
+	int nnrep[] = {10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 400, 400, 400, 400, 400, 200, 200, 200, 200, 200, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 20, 20, 20, 20, 20, 20, 20, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 4, 4, 4, 4, 4};
+	
+//	for(ll=0; ll<24; ll++)
+	for(ll=0; ll<75; ll++)
+//	for(ll=0; ll<115; ll++)
+//	for(ll=0; ll<120; ll++)
+
+		{
+
+		int n = nn[ll];
+		int nrep = nnrep[ll];
+//		int n = ll+1;
+//		int nrep = nnrep[0];
+//		n = n<16 ? 16 : n;
+
+		int n2 = n*n;
+
+#else
+	int nn[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
+	
+	for(ll=0; ll<24; ll++)
+
+		{
+
+		int n = nn[ll];
+		int nrep = 40000; //nnrep[ll];
+#endif
+
+		float *A; s_zeros(&A, n, n);
+		float *B; s_zeros(&B, n, n);
+		float *C; s_zeros(&C, n, n);
+		float *M; s_zeros(&M, n, n);
+
+		char c_n = 'n';
+		char c_l = 'l';
+		char c_r = 'r';
+		char c_t = 't';
+		char c_u = 'u';
+		int i_1 = 1;
+		int i_t;
+		float d_1 = 1;
+		float d_0 = 0;
+	
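+		// test data: A = 0,1,...,n*n-1 (column-major), B = identity, M = all ones,
+		// B2 = identity plus 1e-15 off-diagonal entries (well conditioned for the
+		// factorization tests)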
+		for(i=0; i<n*n; i++)
+			A[i] = i;
+	
+		for(i=0; i<n; i++)
+			B[i*(n+1)] = 1;
+	
+		for(i=0; i<n*n; i++)
+			M[i] = 1;
+	
+		float *B2; s_zeros(&B2, n, n);
+		for(i=0; i<n*n; i++)
+			B2[i] = 1e-15;
+		for(i=0; i<n; i++)
+			B2[i*(n+1)] = 1;
+
+		float *x; s_zeros(&x, n, 1);
+		float *y; s_zeros(&y, n, 1);
+		float *x2; s_zeros(&x2, n, 1);
+		float *y2; s_zeros(&y2, n, 1);
+		float *diag; s_zeros(&diag, n, 1);
+		int *ipiv; int_zeros(&ipiv, n, 1);
+
+//		for(i=0; i<n; i++) x[i] = 1;
+//		for(i=0; i<n; i++) x2[i] = 1;
+
+		// matrix struct
+#if 0
+		struct s_strmat sA; s_allocate_strmat(n+4, n+4, &sA);
+		struct s_strmat sB; s_allocate_strmat(n+4, n+4, &sB);
+		struct s_strmat sC; s_allocate_strmat(n+4, n+4, &sC);
+		struct s_strmat sD; s_allocate_strmat(n+4, n+4, &sD);
+		struct s_strmat sE; s_allocate_strmat(n+4, n+4, &sE);
+#else
+		struct s_strmat sA; s_allocate_strmat(n, n, &sA);
+		struct s_strmat sB; s_allocate_strmat(n, n, &sB);
+		struct s_strmat sC; s_allocate_strmat(n, n, &sC);
+		struct s_strmat sD; s_allocate_strmat(n, n, &sD);
+		struct s_strmat sE; s_allocate_strmat(n, n, &sE);
+#endif
+		struct s_strvec sx; s_allocate_strvec(n, &sx);
+		struct s_strvec sy; s_allocate_strvec(n, &sy);
+		struct s_strvec sz; s_allocate_strvec(n, &sz);
+
+		s_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+		s_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+		s_cvt_vec2strvec(n, x, &sx, 0);
+
+
+		// create matrix to pivot all the time
+//		sgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+
+		float *dummy;
+
+		int info;
+
+		/* timing */
+		struct timeval tvm1, tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, tv9, tv10, tv11, tv12, tv13, tv14, tv15, tv16;
+
+		/* warm up */
+		for(rep=0; rep<nrep; rep++)
+			{
+			sgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+			}
+
+		float alpha = 1.0;
+		float beta = 0.0;
+
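+		// timers: tv1..tv2 brackets the BLASFEO (libstr) loop, tv2..tv3 the
+		// reference BLAS loop; tv0..tv1 brackets nothing and the corresponding
+		// time_hpmpc result is not printed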
+		gettimeofday(&tv0, NULL); // stop
+
+		gettimeofday(&tv1, NULL); // stop
+
+		for(rep=0; rep<nrep; rep++)
+			{
+//			kernel_sgemm_nt_24x4_lib8(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_sgemm_nt_16x4_lib8(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_sgemm_nt_8x8_lib8(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+//			kernel_sgemm_nt_8x4_lib8(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+//			kernel_sgemm_nt_4x8_gen_lib8(n, &alpha, sA.pA, sB.pA, &beta, 0, sD.pA, sD.cn, 0, sD.pA, sD.cn, 0, 4, 0, 8);
+//			kernel_sgemm_nt_4x8_vs_lib8(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA, 4, 8);
+//			kernel_sgemm_nt_4x8_lib8(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+//			kernel_sgemm_nt_12x4_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_sgemm_nt_8x4_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_sgemm_nt_4x4_lib4(n, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+//			kernel_sgemm_nn_16x4_lib8(n, &alpha, sA.pA, sA.cn, 0, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//			kernel_sgemm_nn_8x8_lib8(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+//			kernel_sgemm_nn_8x4_lib8(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sD.pA, sD.pA);
+
+//			sgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+//			sgemm_nn_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+//			ssyrk_ln_libstr(n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
+//			spotrf_l_mn_libstr(n, n, &sB, 0, 0, &sB, 0, 0);
+			spotrf_l_libstr(n, &sB, 0, 0, &sB, 0, 0);
+//			sgetr_libstr(n, n, &sA, 0, 0, &sB, 0, 0);
+//			sgetrf_nopivot_libstr(n, n, &sB, 0, 0, &sB, 0, 0);
+//			sgetrf_libstr(n, n, &sB, 0, 0, &sB, 0, 0, ipiv);
+//			strmm_rlnn_libstr(n, n, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+//			strmm_rutn_libstr(n, n, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+//			strsm_llnu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+//			strsm_lunn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+//			strsm_rltn_libstr(n, n, 1.0, &sB, 0, 0, &sD, 0, 0, &sD, 0, 0);
+//			strsm_rltu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+//			strsm_rutn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sB, 0, 0);
+//			sgemv_n_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+//			sgemv_t_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+//			ssymv_l_libstr(n, n, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sy, 0, &sz, 0);
+//			sgemv_nt_libstr(n, n, 1.0, 1.0, &sA, 0, 0, &sx, 0, &sx, 0, 0.0, 0.0, &sy, 0, &sy, 0, &sz, 0, &sz, 0);
+			}
+
+//		d_print_strmat(n, n, &sD, 0, 0);
+
+		gettimeofday(&tv2, NULL); // stop
+
+		for(rep=0; rep<nrep; rep++)
+			{
+#if defined(REF_BLAS_OPENBLAS) || defined(REF_BLAS_NETLIB) || defined(REF_BLAS_MKL)
+//			sgemm_(&c_n, &c_t, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
+//			sgemm_(&c_n, &c_n, &n, &n, &n, &d_1, A, &n, M, &n, &d_0, C, &n);
+//			scopy_(&n2, A, &i_1, B, &i_1);
+//			ssyrk_(&c_l, &c_n, &n, &n, &d_1, A, &n, &d_0, C, &n);
+//			strmm_(&c_r, &c_u, &c_t, &c_n, &n, &n, &d_1, A, &n, C, &n);
+//			spotrf_(&c_l, &n, B2, &n, &info);
+//			sgetrf_(&n, &n, B2, &n, ipiv, &info);
+//			strsm_(&c_l, &c_l, &c_n, &c_u, &n, &n, &d_1, B2, &n, B, &n);
+//			strsm_(&c_l, &c_u, &c_n, &c_n, &n, &n, &d_1, B2, &n, B, &n);
+//			strtri_(&c_l, &c_n, &n, B2, &n, &info);
+//			slauum_(&c_l, &n, B, &n, &info);
+//			sgemv_(&c_n, &n, &n, &d_1, A, &n, x, &i_1, &d_0, y, &i_1);
+//			sgemv_(&c_t, &n, &n, &d_1, A, &n, x2, &i_1, &d_0, y2, &i_1);
+//			strmv_(&c_l, &c_n, &c_n, &n, B, &n, x, &i_1);
+//			strsv_(&c_l, &c_n, &c_n, &n, B, &n, x, &i_1);
+//			ssymv_(&c_l, &n, &d_1, A, &n, x, &i_1, &d_0, y, &i_1);
+
+//			for(i=0; i<n; i++)
+//				{
+//				i_t = n-i;
+//				scopy_(&i_t, &B[i*(n+1)], &i_1, &C[i*(n+1)], &i_1);
+//				}
+//			ssyrk_(&c_l, &c_n, &n, &n, &d_1, A, &n, &d_1, C, &n);
+//			spotrf_(&c_l, &n, C, &n, &info);
+
+#endif
+
+#if defined(REF_BLAS_BLIS)
+//			sgemm_(&c_n, &c_t, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
+//			sgemm_(&c_n, &c_n, &n77, &n77, &n77, &d_1, A, &n77, B, &n77, &d_0, C, &n77);
+//			ssyrk_(&c_l, &c_n, &n77, &n77, &d_1, A, &n77, &d_0, C, &n77);
+//			strmm_(&c_r, &c_u, &c_t, &c_n, &n77, &n77, &d_1, A, &n77, C, &n77);
+//			spotrf_(&c_l, &n77, B, &n77, &info);
+//			strtri_(&c_l, &c_n, &n77, B, &n77, &info);
+//			slauum_(&c_l, &n77, B, &n77, &info);
+#endif
+			}
+
+		gettimeofday(&tv3, NULL); // stop
+
+		// flops
+		if(1)
+			{
+
+			float Gflops_max = flops_max * GHz_max;
+
+//			float flop_operation = 6*16.0*2*n; // kernel 24x4
+//			float flop_operation = 4*16.0*2*n; // kernel 16x4
+//			float flop_operation = 3*16.0*2*n; // kernel 12x4
+//			float flop_operation = 2*16.0*2*n; // kernel 8x4
+//			float flop_operation = 1*16.0*2*n; // kernel 4x4
+
+//			float flop_operation = 2.0*n*n*n; // sgemm
+//			float flop_operation = 1.0*n*n*n; // ssyrk strmm strsm
+			float flop_operation = 1.0/3.0*n*n*n; // spotrf strtri
+//			float flop_operation = 2.0/3.0*n*n*n; // sgetrf
+//			float flop_operation = 2.0*n*n; // sgemv ssymv
+//			float flop_operation = 1.0*n*n; // strmv strsv
+//			float flop_operation = 4.0*n*n; // sgemv_nt
+
+//			float flop_operation = 4.0/3.0*n*n*n; // ssyrk+spotrf
+
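+			// the selected flop count must match the routine enabled in the
+			// timed loop above (here spotrf_l_libstr, hence n^3/3 flops)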
+			float time_hpmpc    = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+			float time_blasfeo  = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+			float time_blas     = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+			float Gflops_hpmpc    = 1e-9*flop_operation/time_hpmpc;
+			float Gflops_blasfeo  = 1e-9*flop_operation/time_blasfeo;
+			float Gflops_blas     = 1e-9*flop_operation/time_blas;
+
+
+			printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+//			fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gflops_blasfeo, 100.0*Gflops_blasfeo/Gflops_max, Gflops_blas, 100.0*Gflops_blas/Gflops_max);
+
+			}
+		// memops
+		else
+			{
+
+			float Gmemops_max = memops_max * GHz_max;
+
+			float memop_operation = 1.0*n*n; // sgecp
+
+			float time_hpmpc    = (float) (tv1.tv_sec-tv0.tv_sec)/(nrep+0.0)+(tv1.tv_usec-tv0.tv_usec)/(nrep*1e6);
+			float time_blasfeo  = (float) (tv2.tv_sec-tv1.tv_sec)/(nrep+0.0)+(tv2.tv_usec-tv1.tv_usec)/(nrep*1e6);
+			float time_blas     = (float) (tv3.tv_sec-tv2.tv_sec)/(nrep+0.0)+(tv3.tv_usec-tv2.tv_usec)/(nrep*1e6);
+
+			float Gmemops_hpmpc    = 1e-9*memop_operation/time_hpmpc;
+			float Gmemops_blasfeo  = 1e-9*memop_operation/time_blasfeo;
+			float Gmemops_blas     = 1e-9*memop_operation/time_blas;
+
+
+			printf("%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gmemops_blasfeo, 100.0*Gmemops_blasfeo/Gmemops_max, Gmemops_blas, 100.0*Gmemops_blas/Gmemops_max);
+//			fprintf(f, "%d\t%7.2f\t%7.2f\t%7.2f\t%7.2f\n", n, Gmemops_blasfeo, 100.0*Gmemops_blasfeo/Gmemops_max, Gmemops_blas, 100.0*Gmemops_blas/Gmemops_max);
+
+			}
+
+
+		free(A);
+		free(B);
+		free(B2);
+		free(C);
+		free(M);
+		free(x);
+		free(y);
+		free(x2);
+		free(y2);
+		free(diag);
+		free(ipiv);
+		
+		s_free_strmat(&sA);
+		s_free_strmat(&sB);
+		s_free_strmat(&sC);
+		s_free_strmat(&sD);
+		s_free_strmat(&sE);
+		s_free_strvec(&sx);
+		s_free_strvec(&sy);
+		s_free_strvec(&sz);
+
+		}
+
+	printf("\n");
+
+//	fprintf(f, "];\n");
+//	fclose(f);
+
+	return 0;
+	
+	}
+
diff --git a/test_problems/test_d_strmat.c b/test_problems/test_d_strmat.c
new file mode 100644
index 0000000..e06cf84
--- /dev/null
+++ b/test_problems/test_d_strmat.c
@@ -0,0 +1,512 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux_ext_dep.h"
+#include "../include/blasfeo_v_aux_ext_dep.h"
+#include "../include/blasfeo_d_aux.h"
+#include "../include/blasfeo_d_kernel.h"
+#include "../include/blasfeo_d_blas.h"
+
+
+int main()
+	{
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+	printf("\nLA provided by HIGH_PERFORMANCE\n\n");
+
+#elif defined(LA_REFERENCE)
+
+	printf("\nLA provided by REFERENCE\n\n");
+
+#elif defined(LA_BLAS)
+
+	printf("\nLA provided by BLAS\n\n");
+
+#else
+
+	printf("\nLA provided by ???\n\n");
+	exit(2);
+
+#endif
+
+	int ii;
+
+	int n = 16;
+
+	//
+	// matrices in column-major format
+	//
+	double *A; d_zeros(&A, n, n);
+	for(ii=0; ii<n*n; ii++) A[ii] = ii;
+//	d_print_mat(n, n, A, n);
+
+	double *B; d_zeros(&B, n, n);
+	for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+//	d_print_mat(n, n, B, n);
+
+	double *C; d_zeros(&C, n, n);
+
+	double *D; d_zeros(&D, n, n);
+	for(ii=0; ii<n*n; ii++) D[ii] = -1;
+
+	double *x_n; d_zeros(&x_n, n, 1);
+//	for(ii=0; ii<n; ii++) x_n[ii] = 1.0;
+	x_n[1] = 1.0;
+//	x_n[1] = 1.0;
+//	x_n[2] = 2.0;
+//	x_n[3] = 3.0;
+	double *x_t; d_zeros(&x_t, n, 1);
+//	for(ii=0; ii<n; ii++) x_n[ii] = 1.0;
+	x_t[0] = 1.0;
+	double *y_n; d_zeros(&y_n, n, 1);
+	double *y_t; d_zeros(&y_t, n, 1);
+	double *z_n; d_zeros(&z_n, n, 1);
+	double *z_t; d_zeros(&z_t, n, 1);
+
+	double *x0; d_zeros(&x0, n, 1); x0[0] = 1.0;
+	double *x1; d_zeros(&x1, n, 1); x1[1] = 1.0;
+	double *x2; d_zeros(&x2, n, 1); x2[2] = 1.0;
+	double *x3; d_zeros(&x3, n, 1); x3[3] = 1.0;
+	double *x4; d_zeros(&x4, n, 1); x4[4] = 1.0;
+	double *x5; d_zeros(&x5, n, 1); x5[5] = 1.0;
+	double *x6; d_zeros(&x6, n, 1); x6[6] = 1.0;
+	double *x7; d_zeros(&x7, n, 1); x7[7] = 1.0;
+	double *x8; d_zeros(&x8, n, 1); x8[8] = 1.0;
+	double *x9; d_zeros(&x9, n, 1); x9[9] = 1.0;
+
+	int *ipiv; int_zeros(&ipiv, n, 1);
+
+	//
+	// matrices in matrix struct format
+	//
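+	// allocate one aligned chunk able to hold 5 n-by-n strmats; each
+	// d_create_strmat call below places a matrix in this chunk and the pointer
+	// is advanced by that matrix's memory_size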
+	int size_strmat = 5*d_size_strmat(n, n);
+	void *memory_strmat; v_zeros_align(&memory_strmat, size_strmat);
+	char *ptr_memory_strmat = (char *) memory_strmat;
+
+	struct d_strmat sA;
+//	d_allocate_strmat(n, n, &sA);
+	d_create_strmat(n, n, &sA, ptr_memory_strmat);
+	ptr_memory_strmat += sA.memory_size;
+	d_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+//	d_cast_mat2strmat(A, &sA);
+	d_print_strmat(n, n, &sA, 0, 0);
+
+	struct d_strmat sB;
+//	d_allocate_strmat(n, n, &sB);
+	d_create_strmat(n, n, &sB, ptr_memory_strmat);
+	ptr_memory_strmat += sB.memory_size;
+	d_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+	d_print_strmat(n, n, &sB, 0, 0);
+
+	struct d_strmat sC;
+//	d_allocate_strmat(n, n, &sC);
+	d_create_strmat(n, n, &sC, ptr_memory_strmat);
+	ptr_memory_strmat += sC.memory_size;
+
+	struct d_strmat sD;
+//	d_allocate_strmat(n, n, &sD);
+	d_create_strmat(n, n, &sD, ptr_memory_strmat);
+	ptr_memory_strmat += sD.memory_size;
+	d_cvt_mat2strmat(n, n, D, n, &sD, 0, 0);
+
+	struct d_strmat sE;
+//	d_allocate_strmat(n, n, &sE);
+	d_create_strmat(n, n, &sE, ptr_memory_strmat);
+	ptr_memory_strmat += sE.memory_size;
+
+	struct d_strvec sx_n;
+	d_allocate_strvec(n, &sx_n);
+	d_cvt_vec2strvec(n, x_n, &sx_n, 0);
+
+	struct d_strvec sx_t;
+	d_allocate_strvec(n, &sx_t);
+	d_cvt_vec2strvec(n, x_t, &sx_t, 0);
+
+	struct d_strvec sy_n;
+	d_allocate_strvec(n, &sy_n);
+	d_cvt_vec2strvec(n, y_n, &sy_n, 0);
+
+	struct d_strvec sy_t;
+	d_allocate_strvec(n, &sy_t);
+	d_cvt_vec2strvec(n, y_t, &sy_t, 0);
+
+	struct d_strvec sz_n;
+	d_allocate_strvec(n, &sz_n);
+	d_cvt_vec2strvec(n, z_n, &sz_n, 0);
+
+	struct d_strvec sz_t;
+	d_allocate_strvec(n, &sz_t);
+	d_cvt_vec2strvec(n, z_t, &sz_t, 0);
+
+	struct d_strvec sx0; d_create_strvec(n, &sx0, x0);
+	struct d_strvec sx1; d_create_strvec(n, &sx1, x1);
+	struct d_strvec sx2; d_create_strvec(n, &sx2, x2);
+	struct d_strvec sx3; d_create_strvec(n, &sx3, x3);
+	struct d_strvec sx4; d_create_strvec(n, &sx4, x4);
+	struct d_strvec sx5; d_create_strvec(n, &sx5, x5);
+	struct d_strvec sx6; d_create_strvec(n, &sx6, x6);
+	struct d_strvec sx7; d_create_strvec(n, &sx7, x7);
+	struct d_strvec sx8; d_create_strvec(n, &sx8, x8);
+	struct d_strvec sx9; d_create_strvec(n, &sx9, x9);
+
+	struct d_strvec sz0; d_allocate_strvec(n, &sz0);
+	struct d_strvec sz1; d_allocate_strvec(n, &sz1);
+	struct d_strvec sz2; d_allocate_strvec(n, &sz2);
+	struct d_strvec sz3; d_allocate_strvec(n, &sz3);
+	struct d_strvec sz4; d_allocate_strvec(n, &sz4);
+	struct d_strvec sz5; d_allocate_strvec(n, &sz5);
+	struct d_strvec sz6; d_allocate_strvec(n, &sz6);
+	struct d_strvec sz7; d_allocate_strvec(n, &sz7);
+	struct d_strvec sz8; d_allocate_strvec(n, &sz8);
+	struct d_strvec sz9; d_allocate_strvec(n, &sz9);
+
+	// tests
+	double *v; d_zeros(&v, n, 1);
+	double *vp; d_zeros(&vp, n, 1);
+	double *vm; d_zeros(&vm, n, 1);
+	double *m; d_zeros(&m, n, 1);
+	double *r; d_zeros(&r, n, 1);
+
+	for(ii=0; ii<n; ii++) v[ii] = ii; // x
+	for(ii=0; ii<n; ii++) vp[ii] = 8.0; // upper
+	for(ii=0; ii<n; ii++) vm[ii] = 3.0; // lower
+	for(ii=0; ii<n; ii++) r[ii] = 2*ii+1; // x
+
+	d_print_mat(1, n, v, 1);
+	d_print_mat(1, n, vp, 1);
+	d_print_mat(1, n, vm, 1);
+	d_print_mat(1, n, r, 1);
+
+	struct d_strvec sv; d_create_strvec(n, &sv, v);
+	struct d_strvec svp; d_create_strvec(n, &svp, vp);
+	struct d_strvec svm; d_create_strvec(n, &svm, vm);
+	struct d_strvec sm; d_create_strvec(n, &sm, m);
+	struct d_strvec sr; d_create_strvec(n, &sr, r);
+
+//	d_print_tran_strvec(n, &sv, 0);
+//	d_print_tran_strvec(n, &svp, 0);
+//	d_print_tran_strvec(n, &svm, 0);
+//	d_print_tran_strvec(n, &sm, 0);
+//	d_print_tran_strvec(n, &sr, 0);
+
+//	d_print_tran_strvec(n, &sm, 0);
+//	DVECEL_LIBSTR(&sm, 0) = 0.0;
+//	DVECEL_LIBSTR(&sm, 1) = 1.0;
+//	DVECEL_LIBSTR(&sm, 2) = 2.0;
+//	d_print_tran_strvec(n, &sm, 0);
+//	return 0;
+
+	double alpha = 1.0;
+	double beta = 0.0;
+	kernel_dgemm_nt_4x4_gen_lib4(4, &alpha, sA.pA, sB.pA, &beta, 0, sD.pA, sA.cn, 0, sD.pA, sE.cn, 1, 3, 1, 3);
+	d_print_strmat(n, n, &sD, 0, 0);
+	return 0;
+	dtrmm_rlnn_libstr(8, 8, alpha, &sA, 3, 0, &sB, 0, 0, &sD, 0, 0);
+//	dgemm_nn_libstr(8, 8, 8, alpha, &sB, 0, 0, &sA, 1, 0, beta, &sA, 0, 0, &sD, 0, 0);
+	d_print_strmat(n, n, &sD, 0, 0);
+	return 0;
+//	dsyrk_ln_libstr(n, 15, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+//	dpotrf_l_mn_libstr(n, 15, &sD, 0, 0, &sD, 0, 0);
+//	dsyrk_dpotrf_ln_libstr(n, 15, n, &sA, 0, 0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+//	dtrmm_rlnn_libstr(n, n, alpha, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+//	dgese_libstr(n, n, 0.0/0.0, &sD, 0, 0);
+//	kernel_dgemm_nt_4x8_lib4(n, &alpha, sA.pA, sB.pA, sB.cn, &beta, sC.pA, sD.pA);
+//	kernel_dgemm_nn_4x8_lib4(n, &alpha, sA.pA, 0, sB.pA, sB.cn, &beta, sC.pA, sD.pA);
+//	kernel_dsyrk_nt_l_4x4_gen_lib4(n, &alpha, sA.pA, sB.pA, &beta, 0, sC.pA, sC.cn, 3, sD.pA, sD.cn, 0, 4, 0, 4);
+//	kernel_dsyrk_nt_l_8x4_gen_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, &beta, 0, sC.pA, sC.cn, 3, sD.pA, sD.cn, 0, 8, 0, 8);
+//	dsyrk_ln_libstr(10, 10, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 1, 0);
+//	d_print_strmat(n, n, &sD, 0, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx0, 0, beta, &sz0, 0, &sz0, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx1, 0, beta, &sz1, 0, &sz1, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx2, 0, beta, &sz2, 0, &sz2, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx3, 0, beta, &sz3, 0, &sz3, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx4, 0, beta, &sz4, 0, &sz4, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx5, 0, beta, &sz5, 0, &sz5, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx6, 0, beta, &sz6, 0, &sz6, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx7, 0, beta, &sz7, 0, &sz7, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx8, 0, beta, &sz8, 0, &sz8, 0);
+	dsymv_l_libstr(10, 10, alpha, &sA, 0, 0, &sx9, 0, beta, &sz9, 0, &sz9, 0);
+	d_print_tran_strvec(n, &sz0, 0);
+	d_print_tran_strvec(n, &sz1, 0);
+	d_print_tran_strvec(n, &sz2, 0);
+	d_print_tran_strvec(n, &sz3, 0);
+	d_print_tran_strvec(n, &sz4, 0);
+	d_print_tran_strvec(n, &sz5, 0);
+	d_print_tran_strvec(n, &sz6, 0);
+	d_print_tran_strvec(n, &sz7, 0);
+	d_print_tran_strvec(n, &sz8, 0);
+	d_print_tran_strvec(n, &sz9, 0);
+	return 0;
+
+//	d_print_strmat(n, n, &sC, 0, 0);
+//	dgese_libstr(n, n, 1.0, &sB, 0, 0);
+//	kernel_dger4_sub_4_lib4(6, sB.pA, sA.pA, sC.pA);
+//	kernel_dger4_sub_4_vs_lib4(6, sB.pA, sA.pA, sC.pA, 1);
+	return 0;
+
+//	d_print_strmat(n, n, &sC, 0, 0);
+//	dgese_libstr(n, n, 1.0, &sB, 0, 0);
+//	kernel_dger4_sub_4_lib4(6, sB.pA, sA.pA, sC.pA);
+//	kernel_dger4_sub_4_vs_lib4(6, sB.pA, sA.pA, sC.pA, 1);
+//	kernel_dger4_sub_8_lib4(5, sB.pA, sB.cn, sA.pA, sC.pA, sC.cn);
+//	kernel_dger4_sub_8_vs_lib4(5, sB.pA, sB.cn, sA.pA, sC.pA, sC.cn, 5);
+//	kernel_dger4_sub_12_lib4(5, sB.pA, sB.cn, sA.pA, sC.pA, sC.cn);
+//	kernel_dger4_sub_12_vs_lib4(5, sB.pA, sB.cn, sA.pA, sC.pA, sC.cn, 9);
+//	kernel_dger4_sub_8c_lib4(9, sB.pA, sA.cn, sA.pA, sC.pA, sC.cn);
+//	kernel_dger4_sub_4c_lib4(9, sB.pA, sA.cn, sA.pA, sC.pA, sC.cn);
+//	d_print_strmat(n, n, &sC, 0, 0);
+//	return 0;
+
+#if 1
+	dgemm_nt_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sC, 0, 0);
+#else
+	dgese_libstr(n, n, 0.1, &sC, 0, 0);
+	DMATEL_LIBSTR(&sC, 0, 0) = 1.0;
+//	DMATEL_LIBSTR(&sC, 0, 1) = 1.0;
+	for(ii=1; ii<n-1; ii++)
+		{
+//		DMATEL_LIBSTR(&sC, ii, ii-1) = 1.0;
+		DMATEL_LIBSTR(&sC, ii, ii) = 1.0;
+//		DMATEL_LIBSTR(&sC, ii, ii+1) = 1.0;
+		}
+//	DMATEL_LIBSTR(&sC, n-1, n-2) = 1.0;
+	DMATEL_LIBSTR(&sC, n-1, n-1) = 1.0;
+#endif
+	d_print_strmat(n, n, &sC, 0, 0);
+	dgese_libstr(n, n, 0.0/0.0, &sD, 0, 0);
+//	d_print_strmat(n, n, &sA, 0, 0);
+//	dgein1_libstr(12.0, &sA, 0, 0);
+//	DMATEL_LIBSTR(&sA, 0, 0) =   12.0;
+//	DMATEL_LIBSTR(&sA, 1, 0) =    6.0;
+//	DMATEL_LIBSTR(&sA, 2, 0) = -  4.0;
+//	DMATEL_LIBSTR(&sA, 0, 1) = - 51.0;
+//	DMATEL_LIBSTR(&sA, 1, 1) =  167.0;
+//	DMATEL_LIBSTR(&sA, 2, 1) =   24.0;
+//	DMATEL_LIBSTR(&sA, 0, 2) =    4.0;
+//	DMATEL_LIBSTR(&sA, 1, 2) = - 68.0;
+//	DMATEL_LIBSTR(&sA, 2, 2) = - 41.0;
+//	d_print_strmat(n, n, &sA, 0, 0);
+	d_print_strmat(n, n, &sC, 0, 0);
+//	printf("\n%f\n", DGEEL_LIBSTR(&sA, 0, 0));
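+	// LQ factorization: query the required workspace size, allocate aligned
+	// memory for it, factorize, and free the workspace afterwards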
+//	int qr_work_size = dgeqrf_work_size_libstr(n, n);
+	int qr_work_size = dgelqf_work_size_libstr(n, n);
+	void *qr_work;
+	v_zeros_align(&qr_work, qr_work_size);
+//	dgeqrf_libstr(10, 10, &sC, 0, 0, &sD, 0, 0, qr_work);
+	dgelqf_libstr(17, 17, &sC, 0, 0, &sD, 0, 0, qr_work);
+//	dgecp_libstr(10, 10, &sC, 0, 0, &sD, 0, 0);
+//	kernel_dgeqrf_4_lib4(16, 12, sD.pA, sD.cn, sD.dA, qr_work);
+//	d_print_strmat(n, n, &sA, 0, 0);
+//	kernel_dgeqrf_vs_lib4(10, 16, 0, sD.pA+0, sD.cn, sD.dA);
+//	kernel_dgelqf_vs_lib4(10, 10, 10, 0, sD.pA+0, sD.cn, sD.dA);
+	d_print_strmat(n, n, &sD, 0, 0);
+	free(qr_work);
+	return 0;
+
+//	dveccl_mask_libstr(n, &svm, 0, &sv, 0, &svp, 0, &sv, 0, &sm, 0);
+//	veccl_libstr(n, &svm, 0, &sv, 0, &svp, 0, &sv, 0);
+//	d_print_tran_strvec(12, &sv, 0);
+//	d_print_tran_strvec(12, &sm, 0);
+//	dvecze_libstr(n, &sm, 0, &sr, 0, &sr, 0);
+//	d_print_tran_strvec(12, &sr, 0);
+//	return 0;
+
+//	d_print_strmat(n, n, &sA, 0, 0);
+//	dtrsv_unn_libstr(n, &sA, 1, 0, &sx0, 0, &sz0, 0);
+//	d_print_tran_strvec(n, &sz0, 0);
+//	dtrsv_unn_libstr(n, &sA, 1, 0, &sx1, 0, &sz1, 0);
+//	d_print_tran_strvec(n, &sz1, 0);
+//	dtrsv_unn_libstr(n, &sA, 1, 0, &sx2, 0, &sz2, 0);
+//	d_print_tran_strvec(n, &sz2, 0);
+//	dtrsv_unn_libstr(n, &sA, 1, 0, &sx3, 0, &sz3, 0);
+//	d_print_tran_strvec(n, &sz3, 0);
+//	return 0;
+
+//	double alpha = 1.0;
+//	double beta = 1.0;
+//	kernel_dgemm_nt_4x12_vs_lib4(n, &alpha, sA.pA, sB.pA, sB.cn, &beta, sD.pA, sD.pA, 3, 10);
+//	kernel_dgemm_nt_8x8u_vs_lib4(n, &alpha, sA.pA, sA.cn, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn, 7, 6);
+	dgemm_nn_libstr(n, n, n, 1.0, &sA, 0, 0, &sA, 0, 0, 1.0, &sB, 0, 0, &sD, 0, 0);
+	d_print_strmat(n, n, &sD, 0, 0);
+	dpotrf_l_libstr(16, &sD, 0, 0, &sD, 0, 0);
+	d_print_strmat(n, n, &sD, 0, 0);
+	return 0;
+
+//	dmatse_libstr(n, n, 100.0, &sD, 0, 0);
+
+//	for(ii=0; ii<n; ii++)
+//		dvecin1_libstr(ii+1, &sx_n, ii);
+//	d_print_tran_strvec(n, &sx_n, 0);
+//	d_print_strmat(n, n, &sD, 0, 0);
+//	// ddiain_libstr(4, -1.0, &sx_n, 1, &sD, 3, 2);
+//	ddiaad_libstr(4, -1.0, &sx_n, 1, &sD, 3, 2);
+//	d_print_strmat(n, n, &sD, 0, 0);
+//	return 0;
+
+//	d_print_tran_strvec(n, &sx_n, 0);
+//	dgemm_l_diag_libstr(n, n, 1.0, &sx_n, 0, &sA, 0, 0, 0.0, &sD, 0, 0, &sD, 0, 0);
+//	dgemm_r_diag_libstr(n, n, 1.0, &sA, 0, 0, &sx_n, 0, 0.0, &sD, 0, 0, &sD, 0, 0);
+//	d_print_strmat(n, n, &sD, 0, 0);
+//	exit(1);
+
+//	dsetmat_libstr(n, n, 0.0, &sD, 0, 0);
+//	dmatin1_libstr(2.0, &sD, 0, 0);
+//	dmatin1_libstr(2.0, &sD, 1, 1);
+//	dmatin1_libstr(2.0, &sD, 2, 2);
+//	dmatin1_libstr(1.0, &sD, 1, 0);
+//	dmatin1_libstr(1.0, &sD, 2, 1);
+//	dmatin1_libstr(0.5, &sD, 2, 0);
+//	d_print_strmat(n, n, &sD, 0, 0);
+//	d_print_tran_strvec(n, &sx_n, 0);
+//	dtrsv_lnn_libstr(n, n, &sD, 0, 0, &sx_n, 0, &sz_n, 0);
+//	d_print_tran_strvec(n, &sz_n, 0);
+//	exit(1);
+
+//	dgemm_nt_libstr(8, 8, 8, 1.0, &sB, 0, 0, &sA, 1, 0, 0.0, &sD, 0, 0, &sD, 0, 0);
+//	d_print_strmat(n, n, &sD, 0, 0);
+//	return 0;
+
+//	double alpha = 1.0;
+//	kernel_dtrmm_nn_rl_4x4_gen_lib4(7, &alpha, sB.pA, 2, sA.pA, sA.cn, 1, sD.pA, sD.cn, 0, 4, 1, 4);
+//	kernel_dtrmm_nn_rl_4x4_gen_lib4(7, &alpha, sB.pA+sB.cn*4, 2, sA.pA, sA.cn, 1, sD.pA+sD.cn*4, sD.cn, 0, 4, 1, 4);
+//	kernel_dtrmm_nn_rl_4x4_lib4(4, &alpha, sB.pA, sA.pA, sA.cn+4*4, sD.pA+4*4);
+//	kernel_dtrmm_nn_rl_4x4_gen_lib4(3, &alpha, sB.pA+sB.cn*4+4*4, 2, sA.pA+sB.cn*4+4*4, sA.cn, 1, sD.pA+sD.cn*4+4*4, sD.cn, 0, 4, 0, 4);
+	dtrmm_rlnn_libstr(8, 8, 1.0, &sB, 0, 0, &sA, 3, 0, &sD, 2, 1);
+	d_print_strmat(n, n, &sD, 0, 0);
+	return 0;
+
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx0, 0, &sx0, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx1, 0, &sx1, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx2, 0, &sx2, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx3, 0, &sx3, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx4, 0, &sx4, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx5, 0, &sx5, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx6, 0, &sx6, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx7, 0, &sx7, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx8, 0, &sx8, 0);
+	dtrmv_lnn_libstr(8, 8, &sA, 0, 0, &sx9, 0, &sx9, 0);
+	d_print_tran_strvec(n, &sx0, 0);
+	d_print_tran_strvec(n, &sx1, 0);
+	d_print_tran_strvec(n, &sx2, 0);
+	d_print_tran_strvec(n, &sx3, 0);
+	d_print_tran_strvec(n, &sx4, 0);
+	d_print_tran_strvec(n, &sx5, 0);
+	d_print_tran_strvec(n, &sx6, 0);
+	d_print_tran_strvec(n, &sx7, 0);
+	d_print_tran_strvec(n, &sx8, 0);
+	d_print_tran_strvec(n, &sx9, 0);
+	return 0;
+
+	dgemv_t_libstr(2, 8, 1.0, &sA, 2, 0, &sx_n, 0, 0.0, &sy_n, 0, &sz_n, 0);
+	d_print_tran_strvec(n, &sz_n, 0);
+	return 0;
+
+	dgemm_nt_libstr(4, 8, 8, 1.0, &sB, 0, 0, &sA, 0, 0, 0.0, &sB, 0, 0, &sD, 3, 0);
+//	d_print_strmat(n, n, &sB, 0, 0);
+	d_print_strmat(n, n, &sD, 0, 0);
+	exit(1);
+
+	dpotrf_l_libstr(n, &sD, 0, 0, &sD, 0, 0);
+//	dgetrf_nopivot_libstr(n, n, &sD, 0, 0, &sD, 0, 0);
+//	dgetrf_libstr(n, n, &sD, 0, 0, &sD, 0, 0, ipiv);
+	d_print_strmat(n, n, &sD, 0, 0);
+#if defined(LA_HIGH_PERFORMANCE) || defined(LA_REFERENCE)
+	d_print_mat(1, n, sD.dA, 1);
+#endif
+	int_print_mat(1, n, ipiv, 1);
+	dtrsm_rltn_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sE, 0, 0);
+	d_print_strmat(n, n, &sE, 0, 0);
+	exit(1);
+
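+	// the two branches below assume sD holds LU factors and ipiv the row permutation:
+	// the first permutes B and solves with the unit lower then the upper factor from
+	// the left, the second solves the transposed system from the right after permuting
+	// columns (only reached if the exit(1) above is removed)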
+#if 1 // solve P L U X = P B
+	d_print_strmat(n, n, &sB, 0, 0);
+	drowpe_libstr(n, ipiv, &sB);
+	d_print_strmat(n, n, &sB, 0, 0);
+
+	dtrsm_llnu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sE, 0, 0);
+	d_print_strmat(n, n, &sE, 0, 0);
+	dtrsm_lunn_libstr(n, n, 1.0, &sD, 0, 0, &sE, 0, 0, &sE, 0, 0);
+	d_print_strmat(n, n, &sE, 0, 0);
+#else // solve X^T (P L U)^T = B^T P^T
+	d_print_strmat(n, n, &sB, 0, 0);
+	dcolpe_libstr(n, ipiv, &sB);
+	d_print_strmat(n, n, &sB, 0, 0);
+
+	dtrsm_rltu_libstr(n, n, 1.0, &sD, 0, 0, &sB, 0, 0, &sE, 0, 0);
+	d_print_strmat(n, n, &sE, 0, 0);
+	dtrsm_rutn_libstr(n, n, 1.0, &sD, 0, 0, &sE, 0, 0, &sE, 0, 0);
+	d_print_strmat(n, n, &sE, 0, 0);
+#endif
+
+//	d_print_strmat(n, n, &sA, 0, 0);
+//	d_print_strmat(n, n, &sB, 0, 0);
+//	d_print_strmat(n, n, &sD, 0, 0);
+//	d_print_strmat(n, n, &sE, 0, 0);
+
+//	d_cvt_strmat2mat(n, n, &sE, 0, 0, C, n);
+//	d_print_mat(n, n, C, n);
+
+	dtrtr_u_libstr(6, &sE, 2, 0, &sB, 1, 0);
+	d_print_strmat(n, n, &sB, 0, 0);
+
+	d_print_strmat(n, n, &sA, 0, 0);
+	dgemv_nt_libstr(6, n, 1.0, 1.0, &sA, 0, 0, &sx_n, 0, &sx_t, 0, 0.0, 0.0, &sy_n, 0, &sy_t, 0, &sz_n, 0, &sz_t, 0);
+//	dsymv_l_libstr(5, 5, 1.0, &sA, 0, 0, x_n, 0.0, y_n, z_n);
+	d_print_mat(1, n, z_n, 1);
+	d_print_mat(1, n, z_t, 1);
+
+
+
+
+//	for(ii=0; ii<sE.pm*sE.cn; ii++) sE.pA[ii] = 0.0;
+//	double alpha = 0.0;
+//	double beta = 1.0;
+//	kernel_dgemm_nt_4x4_gen_lib4(4, &alpha, sA.pA, sB.pA, &beta, 3, sA.pA, sA.cn, 0, sE.pA, sE.cn, 0, 4, 2, 2);
+//	d_print_strmat(n, n, &sE, 0, 0);
+
+	// free memory
+	free(A);
+	free(B);
+	free(C);
+	free(D);
+	free(ipiv);
+//	d_free_strmat(&sA);
+//	d_free_strmat(&sB);
+//	d_free_strmat(&sD);
+	v_free_align(memory_strmat);
+
+	return 0;
+
+	}
diff --git a/test_problems/test_s_strmat.c b/test_problems/test_s_strmat.c
new file mode 100644
index 0000000..456db87
--- /dev/null
+++ b/test_problems/test_s_strmat.c
@@ -0,0 +1,191 @@
+/**************************************************************************************************
+*                                                                                                 *
+* This file is part of BLASFEO.                                                                   *
+*                                                                                                 *
+* BLASFEO -- BLAS For Embedded Optimization.                                                      *
+* Copyright (C) 2016-2017 by Gianluca Frison.                                                     *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
+* All rights reserved.                                                                            *
+*                                                                                                 *
+* HPMPC is free software; you can redistribute it and/or                                          *
+* modify it under the terms of the GNU Lesser General Public                                      *
+* License as published by the Free Software Foundation; either                                    *
+* version 2.1 of the License, or (at your option) any later version.                              *
+*                                                                                                 *
+* HPMPC is distributed in the hope that it will be useful,                                        *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of                                  *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                                            *
+* See the GNU Lesser General Public License for more details.                                     *
+*                                                                                                 *
+* You should have received a copy of the GNU Lesser General Public                                *
+* License along with HPMPC; if not, write to the Free Software                                    *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA                  *
+*                                                                                                 *
+* Author: Gianluca Frison, giaf (at) dtu.dk                                                       *
+*                          gianluca.frison (at) imtek.uni-freiburg.de                             *
+*                                                                                                 *
+**************************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "../include/blasfeo_common.h"
+#include "../include/blasfeo_i_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux_ext_dep.h"
+#include "../include/blasfeo_s_aux.h"
+#include "../include/blasfeo_s_kernel.h"
+#include "../include/blasfeo_s_blas.h"
+
+
+int main()
+	{
+
+#if defined(LA_HIGH_PERFORMANCE)
+
+	printf("\nLA provided by HIGH_PERFORMANCE\n\n");
+
+#elif defined(LA_REFERENCE)
+
+	printf("\nLA provided by REFERENCE\n\n");
+
+#elif defined(LA_BLAS)
+
+	printf("\nLA provided by BLAS\n\n");
+
+#else
+
+	printf("\nLA provided by ???\n\n");
+	exit(2);
+
+#endif
+
+	int ii, jj;
+
+	int n = 16;
+
+	//
+	// matrices in column-major format
+	//
+	float *A; s_zeros(&A, n, n);
+	for(ii=0; ii<n*n; ii++) A[ii] = ii;
+//	for(jj=0; jj<n; jj++)
+//		for(ii=0; ii<jj; ii++)
+//			A[ii+n*jj] = 0.0/0.0;
+//	s_print_mat(n, n, A, n);
+
+	float *B; s_zeros(&B, n, n);
+	for(ii=0; ii<n; ii++) B[ii*(n+1)] = 1.0;
+//	s_print_mat(n, n, B, n);
+
+	float *D; s_zeros(&D, n, n);
+	for(ii=0; ii<n*n; ii++) D[ii] = -1.0;
+//	s_print_mat(n, n, B, n);
+
+
+	//
+	// matrices in matrix struct format
+	//
+
+	struct s_strmat sA;
+	s_allocate_strmat(n, n, &sA);
+	s_cvt_mat2strmat(n, n, A, n, &sA, 0, 0);
+	s_print_strmat(n, n, &sA, 0, 0);
+
+	struct s_strmat sB;
+	s_allocate_strmat(n, n, &sB);
+	s_cvt_mat2strmat(n, n, B, n, &sB, 0, 0);
+	s_print_strmat(n, n, &sB, 0, 0);
+
+	struct s_strmat sD;
+	s_allocate_strmat(n, n, &sD);
+	s_cvt_mat2strmat(n, n, D, n, &sD, 0, 0);
+
+	struct s_strvec sx;
+	s_allocate_strvec(n, &sx);
+	sx.pa[7] = 1.0;
+	s_print_tran_strvec(n, &sx, 0);
+
+	struct s_strvec sz0;
+	s_allocate_strvec(n, &sz0);
+
+	struct s_strvec sz1;
+	s_allocate_strvec(n, &sz1);
+
+	//
+	// tests
+	//
+
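+	// only one routine is exercised at a time; uncomment the call of interest
+	// (the active call below is strmm_rlnn_libstr on a 12x8 block)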
+	float alpha = 1.0;
+	float beta = 0.0;
+//	kernel_sgemm_nt_24x4_lib8(4, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//	kernel_sgemm_nt_16x4_lib8(4, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//	kernel_sgemm_nt_8x8_lib8(5, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+//	kernel_sgemm_nt_8x4_lib8(5, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA);
+//	kernel_sgemm_nt_4x8_gen_lib8(8, &alpha, sA.pA, sB.pA, &beta, 0, sD.pA, sD.cn, 0, sD.pA, sD.cn, 0, 4, 0, 8);
+//	kernel_sgemm_nt_8x4_vs_lib8(8, &alpha, sA.pA, sB.pA, &beta, sD.pA, sD.pA, 7, 4);
+//	kernel_sgemm_nt_8x4_lib8(8, &alpha, sB.pA, sA.pA+4, &beta, sA.pA+4*8, sD.pA+4*8);
+//	kernel_sgemm_nn_16x4_lib8(4, &alpha, sA.pA, sA.cn, 0, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//	kernel_sgemm_nt_12x4_lib4(4, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//	kernel_sgemm_nt_8x8_lib4(8, &alpha, sA.pA, sA.cn, sB.pA, sB.cn, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//	kernel_sgemm_nt_8x4_lib4(2, &alpha, sA.pA, sA.cn, sB.pA, &beta, sD.pA, sD.cn, sD.pA, sD.cn);
+//	s_print_strmat(n, n, &sD, 0, 0);
+//	return 0;
+//	sgemm_nt_libstr(n, n, 5, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sB, 0, 0, &sD, 0, 0);
+//	ssyrk_ln_libstr(n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sB, 0, 0, &sD, 0, 0);
+//	ssyrk_ln_mn_libstr(n, n, n, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sB, 0, 0, &sD, 0, 0);
+//	kernel_ssyrk_nt_l_8x8_lib8(n, &alpha, sA.pA, sA.pA, &beta, sB.pA, sD.pA);
+//	sgecp_libstr(16, 16, &sA, 2, 0, &sD, 1, 0);
+//	sgetr_libstr(16, 16, &sA, 2, 0, &sD, 2, 0);
+//	s_print_strmat(n, n, &sD, 0, 0);
+//	sgemv_n_libstr(6, 6, 1.0, &sA, 1, 0, &sx, 0, 0.0, &sz0, 0, &sz0, 0);
+//	sgemv_t_libstr(11, 8, 1.0, &sA, 0, 0, &sx, 0, 0.0, &sz0, 0, &sz0, 0);
+//	strmv_lnn_libstr(6, 6, &sA, 1, 0, &sx, 0, &sz0, 0);
+//	strmv_ltn_libstr(10, 10, &sA, 1, 0, &sx, 0, &sz0, 0);
+//	sA.pA[0] = 1.0;
+//	strsv_lnn_libstr(10, &sA, 0, 0, &sx, 0, &sz0, 0);
+//	for(ii=0; ii<8; ii++) sA.dA[ii] = 1.0/sgeex1_libstr(&sA, ii, ii);
+//	kernel_strsv_lt_inv_8_lib8(0, sA.pA, sA.cn, sA.dA, sx.pa, sx.pa, sz0.pa);
+//	kernel_strsv_lt_inv_8_vs_lib8(0, sA.pA, sA.cn, sA.dA, sx.pa, sx.pa, sz0.pa, 3);
+//	s_print_strmat(n, n, &sA, 0, 0);
+//	strsv_ltn_libstr(12, &sA, 0, 0, &sx, 0, &sz0, 0);
+//	strsv_ltn_mn_libstr(11, 3, &sA, 0, 0, &sx, 0, &sz0, 0);
+//	s_print_strmat(n, n, &sA, 0, 0);
+//	kernel_sgemv_nt_4_lib8(n, &alpha, &alpha, sA.pA, sA.cn, sx.pa, sx.pa, &beta, sz1.pa, sz0.pa, sz1.pa);
+//	kernel_sgemv_nt_4_vs_lib8(n, &alpha, &alpha, sA.pA, sA.cn, sx.pa, sx.pa, &beta, sz1.pa, sz0.pa, sz1.pa, 3);
+//	sgemv_nt_libstr(5, 2, alpha, alpha, &sA, 0, 0, &sx, 0, &sx, 0, beta, beta, &sz0, 0, &sz1, 0, &sz0, 0, &sz1, 0);
+//	ssymv_l_libstr(10, 10, alpha, &sA, 1, 0, &sx, 0, beta, &sz0, 0, &sz1, 0);
+//	s_print_tran_strvec(n, &sz0, 0);
+//	s_print_tran_strvec(n, &sz1, 0);
+//	return 0;
+//	sgesc_libstr(16, 9, 2.0, &sD, 0, 0);
+//	s_print_strmat(n, n, &sD, 0, 0);
+//	kernel_spotrf_nt_l_8x8_lib8(0, sD.pA, sD.pA, sD.pA, sD.pA, sx.pa);
+//	s_print_strmat(n, n, &sD, 0, 0);
+//	s_print_tran_strvec(n, &sx, 0);
+//	kernel_strsm_nt_rl_inv_8x8_lib8(0, sD.pA, sD.pA, sD.pA+8*sD.cn, sD.pA+8*sD.cn, sD.pA, sx.pa);
+//	s_print_strmat(n, n, &sD, 0, 0);
+//	kernel_spotrf_nt_l_8x8_lib8(8, sD.pA+8*sD.cn, sD.pA+8*sD.cn, sD.pA+8*sD.cn+8*8, sD.pA+8*sD.cn+8*8, sx.pa+8);
+//	spotrf_l_mn_libstr(23, 17, &sD, 0, 0, &sD, 0, 0);
+//	spotrf_l_libstr(n, &sD, 0, 0, &sD, 0, 0);
+//	kernel_strmm_nn_rl_8x4_lib8(3, &alpha, sB.pA, 7, sA.pA, sA.cn, sD.pA);
+	strmm_rlnn_libstr(12, 8, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);
+	s_print_strmat(n, n, &sD, 0, 0);
+	return 0;
+
+
+
+	//
+	// free memory
+	//
+
+	free(A);
+	free(B);
+	free(D);
+	s_free_strmat(&sA);
+	s_free_strmat(&sB);
+	s_free_strmat(&sD);
+
+	return 0;
+	
+	}